From e726563d25c5fd3570b965bbbc85bb4321482bf5 Mon Sep 17 00:00:00 2001 From: Jonathan Alvarez Delgado Date: Thu, 16 Apr 2026 18:08:35 +0200 Subject: [PATCH 1/4] feat(pulumi): add monitoring baseline with explicit CW alarms --- infra/pulumi/__main__.py | 1006 ++++++++++++++++++++++++++++++++ infra/pulumi/config.stage.yaml | 46 ++ 2 files changed, 1052 insertions(+) diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py index 9baacf48f1f3..aaa8315145a8 100755 --- a/infra/pulumi/__main__.py +++ b/infra/pulumi/__main__.py @@ -768,6 +768,7 @@ def main(): # Dedicated stage broker replacing the production EC2 RabbitMQ that # atn/stage/celery_broker previously pointed to (issue #375) mq_config = resources.get("aws:mq:RabbitMQBroker", {}) + mq_broker = None if mq_config and private_subnets and vpc_resource: mq_creds_secret_name = mq_config.get("credentials_secret_name") @@ -885,6 +886,1011 @@ def main(): ), ) + # ========================================================================= + # Monitoring and Alarms (prod-gating baseline) + # ========================================================================= + # Phase 1 observability: SNS notification path, CloudWatch alarms for + # ALB/TG/ECS/MQ/Redis, and one operational dashboard + # + # All alarms are written explicitly (not via CloudWatchMonitoringGroup) + # for a single SNS topic, full control over alarm descriptions, and + # correct metric names (upstream tb_pulumi has a target_5xx metric bug) + # + # Thresholds live in config.stage.yaml under resources.monitoring.alarms + monitoring_cfg = resources.get("monitoring", {}) + alarm_cfg = monitoring_cfg.get("alarms", {}) + + if monitoring_cfg and fargate_services: + notify_secret_name = monitoring_cfg.get("notify_emails_secret_name") + notify_emails = [] + if notify_secret_name: + notify_emails_raw = aws.secretsmanager.get_secret_version( + secret_id=notify_secret_name, + ) + notify_emails = [ + e.strip() + for e in notify_emails_raw.secret_string.split(",") 
+ if e.strip() + ] + + # ----------------------------------------------------------------- + # SNS topic + email subscriptions + # ----------------------------------------------------------------- + alarm_topic = aws.sns.Topic( + f"{project.name_prefix}-alarm-topic", + name=f"{project.name_prefix}-alarms", + tags={ + **project.common_tags, + "Name": f"{project.name_prefix}-alarms", + }, + ) + + for idx, email in enumerate(notify_emails): + aws.sns.TopicSubscription( + f"{project.name_prefix}-alarm-sub-{idx}", + protocol="email", + endpoint=email, + topic=alarm_topic.arn, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), + ) + + # ----------------------------------------------------------------- + # ALB alarms (web, versioncheck) + # ----------------------------------------------------------------- + alb_cfg = alarm_cfg.get("alb", {}) + alb_error_threshold = alb_cfg.get("error_threshold", 10) + alb_error_period = alb_cfg.get("error_period", 60) + alb_rt_threshold = alb_cfg.get("response_time_threshold", 1) + alb_rt_period = alb_cfg.get("response_time_period", 60) + alb_eval_periods = alb_cfg.get("evaluation_periods", 2) + + for svc_name in ["web", "versioncheck"]: + fargate_svc = fargate_services.get(svc_name) + if not fargate_svc: + continue + svc_alb = fargate_svc.resources.get("fargate_service_alb") + if not svc_alb: + continue + alb = svc_alb.resources["albs"].get(svc_name) + if not alb: + continue + + lb_suffix = alb.arn_suffix + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-alb-5xx", + name=f"{project.name_prefix}-{svc_name}-alb-5xx", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"LoadBalancer": lb_suffix}, + metric_name="HTTPCode_ELB_5XX_Count", + namespace="AWS/ApplicationELB", + statistic="Sum", + threshold=alb_error_threshold, + period=alb_error_period, + evaluation_periods=alb_eval_periods, + treat_missing_data="notBreaching", + 
alarm_description=( + f"Elevated 5xx errors on the {svc_name} ALB. " + "Check: ECS task health in console, then " + "application logs in CloudWatch for stack traces." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-target-5xx", + name=f"{project.name_prefix}-{svc_name}-target-5xx", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"LoadBalancer": lb_suffix}, + metric_name="HTTPCode_Target_5XX_Count", + namespace="AWS/ApplicationELB", + statistic="Sum", + threshold=alb_error_threshold, + period=alb_error_period, + evaluation_periods=alb_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Elevated 5xx errors from {svc_name} application targets. " + "Check: application logs for exceptions, database " + "connectivity, and upstream dependency health." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-response-time", + name=f"{project.name_prefix}-{svc_name}-response-time", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"LoadBalancer": lb_suffix}, + metric_name="TargetResponseTime", + namespace="AWS/ApplicationELB", + statistic="Average", + threshold=alb_rt_threshold, + period=alb_rt_period, + evaluation_periods=alb_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Average response time above {alb_rt_threshold}s on {svc_name}. " + "Check: is traffic elevated? Are database queries slow? " + "Is Memcached reachable?" 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]), + ) + + # ----------------------------------------------------------------- + # Target group alarms (web, versioncheck) + # ----------------------------------------------------------------- + tg_cfg = alarm_cfg.get("target_group", {}) + tg_unhealthy_threshold = tg_cfg.get("unhealthy_threshold", 1) + tg_period = tg_cfg.get("period", 60) + tg_eval_periods = tg_cfg.get("evaluation_periods", 2) + + for svc_name in ["web", "versioncheck"]: + fargate_svc = fargate_services.get(svc_name) + if not fargate_svc: + continue + svc_alb = fargate_svc.resources.get("fargate_service_alb") + if not svc_alb: + continue + alb = svc_alb.resources["albs"].get(svc_name) + tg = svc_alb.resources["target_groups"].get(svc_name) + if not alb or not tg: + continue + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-unhealthy-hosts", + name=f"{project.name_prefix}-{svc_name}-unhealthy-hosts", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={ + "TargetGroup": tg.arn_suffix, + "LoadBalancer": alb.arn_suffix, + }, + metric_name="UnHealthyHostCount", + namespace="AWS/ApplicationELB", + statistic="Average", + threshold=tg_unhealthy_threshold, + period=tg_period, + evaluation_periods=tg_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Unhealthy hosts detected in {svc_name} target group. " + "Check: ECS task status, health check endpoint " + "(/services/monitor.json), container logs." 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]), + ) + + # ----------------------------------------------------------------- + # ECS service alarms (web, worker, versioncheck) + # ----------------------------------------------------------------- + ecs_cfg = alarm_cfg.get("ecs", {}) + ecs_cpu_threshold = ecs_cfg.get("cpu_threshold", 80) + ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80) + ecs_period = ecs_cfg.get("period", 300) + ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2) + + for svc_name, fargate_svc in fargate_services.items(): + ecs_service = fargate_svc.resources.get("service") + ecs_cluster = fargate_svc.resources.get("cluster") + if not ecs_service or not ecs_cluster: + continue + + cluster_name = ecs_cluster.arn.apply(lambda arn: arn.split("/")[-1]) + service_name = ecs_service.name + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-ecs-cpu", + name=f"{project.name_prefix}-{svc_name}-ecs-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={ + "ClusterName": cluster_name, + "ServiceName": service_name, + }, + metric_name="CPUUtilization", + namespace="AWS/ECS", + statistic="Average", + threshold=ecs_cpu_threshold, + period=ecs_period, + evaluation_periods=ecs_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"CPU utilisation above {ecs_cpu_threshold}% on {svc_name} service. " + "Check: is traffic elevated? Are tasks stuck? " + "Consider scaling if sustained." 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-ecs-memory", + name=f"{project.name_prefix}-{svc_name}-ecs-memory", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={ + "ClusterName": cluster_name, + "ServiceName": service_name, + }, + metric_name="MemoryUtilization", + namespace="AWS/ECS", + statistic="Average", + threshold=ecs_mem_threshold, + period=ecs_period, + evaluation_periods=ecs_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Memory utilisation above {ecs_mem_threshold}% on {svc_name} service. " + "Check: application memory leaks, task resource limits, " + "consider scaling." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]), + ) + + # ----------------------------------------------------------------- + # Amazon MQ alarms + # ----------------------------------------------------------------- + mq_cfg = alarm_cfg.get("mq", {}) + + if mq_broker is not None: + mq_queue_name = mq_cfg.get("queue_name", "olympia") + mq_vhost = mq_cfg.get("virtual_host", "/") + mq_msg_threshold = mq_cfg.get("message_ready_threshold", 1000) + mq_consumer_alarm_enabled = mq_cfg.get("consumer_alarm_enabled", False) + mq_consumer_threshold = mq_cfg.get("consumer_count_threshold", 1) + mq_cpu_threshold = mq_cfg.get("cpu_threshold", 80) + mq_mem_threshold = mq_cfg.get("memory_bytes_threshold", 512000000) + mq_period = mq_cfg.get("period", 300) + mq_eval_periods = mq_cfg.get("evaluation_periods", 2) + + broker_id = mq_broker.id + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-message-ready", + name=f"{project.name_prefix}-mq-message-ready", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={ + "Broker": 
broker_id, + "VirtualHost": mq_vhost, + "Queue": mq_queue_name, + }, + metric_name="MessageReadyCount", + namespace="AWS/AmazonMQ", + statistic="Average", + threshold=mq_msg_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Queue '{mq_queue_name}' has over {mq_msg_threshold} " + "ready messages. Check: is the worker consuming? " + "Are tasks backing up? Check worker logs." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + if mq_consumer_alarm_enabled: + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-consumer-count", + name=f"{project.name_prefix}-mq-consumer-count", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="LessThanThreshold", + dimensions={ + "Broker": broker_id, + "VirtualHost": mq_vhost, + "Queue": mq_queue_name, + }, + metric_name="ConsumerCount", + namespace="AWS/AmazonMQ", + statistic="Minimum", + threshold=mq_consumer_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + treat_missing_data="breaching", + alarm_description=( + f"No consumers connected to the '{mq_queue_name}' queue. " + "Check: is the worker service running? " + "Check worker logs for connection errors." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-cpu", + name=f"{project.name_prefix}-mq-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"Broker": broker_id}, + metric_name="SystemCpuUtilization", + namespace="AWS/AmazonMQ", + statistic="Average", + threshold=mq_cpu_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Broker CPU above {mq_cpu_threshold}%. 
Check: " + "queue depth, message throughput, consider " + "upgrading instance type if sustained." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-memory", + name=f"{project.name_prefix}-mq-memory", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"Broker": broker_id}, + metric_name="RabbitMQMemUsed", + namespace="AWS/AmazonMQ", + statistic="Average", + threshold=mq_mem_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Broker memory above {mq_mem_threshold} bytes. " + "Check: queue depth and message sizes, consider " + "purging stale queues or upgrading instance." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + # ----------------------------------------------------------------- + # Redis alarms + # ----------------------------------------------------------------- + redis_cfg = alarm_cfg.get("redis", {}) + redis_cluster = elasticache_clusters.get("redis") + + if redis_cluster: + redis_mem_threshold = redis_cfg.get("memory_pct_threshold", 80) + redis_eviction_threshold = redis_cfg.get("eviction_threshold", 100) + redis_cpu_threshold = redis_cfg.get("cpu_threshold", 80) + redis_host_cpu_threshold = redis_cfg.get("host_cpu_threshold", 90) + redis_conn_threshold = redis_cfg.get("connection_threshold", 500) + redis_period = redis_cfg.get("period", 300) + redis_eval_periods = redis_cfg.get("evaluation_periods", 2) + + replication_group = redis_cluster.resources["replication_group"] + cache_cluster_id = replication_group.id.apply(lambda rg_id: f"{rg_id}-001") + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-memory", + name=f"{project.name_prefix}-redis-memory", + alarm_actions=[alarm_topic.arn], + 
ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="DatabaseMemoryUsagePercentage", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_mem_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis memory usage above {redis_mem_threshold}%. " + "Check: eviction count, key count growth, " + "potential memory leak in application cache usage." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-evictions", + name=f"{project.name_prefix}-redis-evictions", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="Evictions", + namespace="AWS/ElastiCache", + statistic="Sum", + threshold=redis_eviction_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis evictions above {redis_eviction_threshold} " + "per period. Check: memory usage, maxmemory-policy, " + "whether the application is over-caching." 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-cpu", + name=f"{project.name_prefix}-redis-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="EngineCPUUtilization", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_cpu_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis engine CPU above {redis_cpu_threshold}%. " + "Check: command complexity (KEYS, SORT), " + "connection count, consider node upgrade." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-connections", + name=f"{project.name_prefix}-redis-connections", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="CurrConnections", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_conn_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis connections above {redis_conn_threshold}. " + "Check: connection pool settings, task/service " + "count, potential connection leaks." 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-host-cpu", + name=f"{project.name_prefix}-redis-host-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="CPUUtilization", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_host_cpu_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis host CPU above {redis_host_cpu_threshold}%. " + "This monitors the underlying host, not just the Redis " + "engine. On nodes with <= 2 vCPUs, EngineCPUUtilization " + "alone can miss host overload. Check: background processes, " + "node type, consider upgrading" + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + # ----------------------------------------------------------------- + # CloudWatch Dashboard + # ----------------------------------------------------------------- + dash_cfg = monitoring_cfg.get("dashboard", {}) + dash_period = dash_cfg.get("period", 300) + + dashboard_outputs = {} + for svc_name in ["web", "versioncheck"]: + fargate_svc = fargate_services.get(svc_name) + if fargate_svc: + svc_alb = fargate_svc.resources.get("fargate_service_alb") + if svc_alb: + alb = svc_alb.resources["albs"].get(svc_name) + if alb: + dashboard_outputs[f"{svc_name}_alb_suffix"] = alb.arn_suffix + svc_res = fargate_svc.resources.get("service") + cluster_res = fargate_svc.resources.get("cluster") + if svc_res: + dashboard_outputs[f"{svc_name}_svc_name"] = svc_res.name + if cluster_res: + dashboard_outputs[f"{svc_name}_cluster"] = cluster_res.arn.apply( + lambda arn: arn.split("/")[-1] + ) + + worker_svc = fargate_services.get("worker") + if 
worker_svc: + svc_res = worker_svc.resources.get("service") + cluster_res = worker_svc.resources.get("cluster") + if svc_res: + dashboard_outputs["worker_svc_name"] = svc_res.name + if cluster_res: + dashboard_outputs["worker_cluster"] = cluster_res.arn.apply( + lambda arn: arn.split("/")[-1] + ) + + if mq_broker is not None: + dashboard_outputs["mq_broker_id"] = mq_broker.id + + if redis_cluster: + dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[ + "replication_group" + ].id.apply(lambda rg_id: f"{rg_id}-001") + + mq_queue = alarm_cfg.get("mq", {}).get("queue_name", "olympia") + mq_vhost_dash = alarm_cfg.get("mq", {}).get("virtual_host", "/") + region = project.aws_region + + if dashboard_outputs: + dashboard_body = pulumi.Output.all(**dashboard_outputs).apply( + lambda o: json.dumps( + { + "widgets": [ + *( + [ + { + "type": "metric", + "x": 0, + "y": 0, + "width": 12, + "height": 6, + "properties": { + "title": "Web ALB - Requests and Errors", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ApplicationELB", + "RequestCount", + "LoadBalancer", + o["web_alb_suffix"], + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_ELB_5XX_Count", + "LoadBalancer", + o["web_alb_suffix"], + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_Target_5XX_Count", + "LoadBalancer", + o["web_alb_suffix"], + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "TargetResponseTime", + "LoadBalancer", + o["web_alb_suffix"], + { + "stat": "Average", + "yAxis": "right", + }, + ], + ], + "yAxis": { + "right": { + "label": "Seconds", + "showUnits": False, + } + }, + }, + } + ] + if "web_alb_suffix" in o + else [] + ), + *( + [ + { + "type": "metric", + "x": 12, + "y": 0, + "width": 12, + "height": 6, + "properties": { + "title": "Versioncheck ALB - Requests and Errors", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ApplicationELB", + "RequestCount", + "LoadBalancer", + o["versioncheck_alb_suffix"], + 
{"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_ELB_5XX_Count", + "LoadBalancer", + o["versioncheck_alb_suffix"], + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_Target_5XX_Count", + "LoadBalancer", + o["versioncheck_alb_suffix"], + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "TargetResponseTime", + "LoadBalancer", + o["versioncheck_alb_suffix"], + { + "stat": "Average", + "yAxis": "right", + }, + ], + ], + "yAxis": { + "right": { + "label": "Seconds", + "showUnits": False, + } + }, + }, + } + ] + if "versioncheck_alb_suffix" in o + else [] + ), + *( + [ + { + "type": "metric", + "x": 0, + "y": 6, + "width": 8, + "height": 6, + "properties": { + "title": "Web ECS - CPU and Memory", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ECS", + "CPUUtilization", + "ClusterName", + o["web_cluster"], + "ServiceName", + o["web_svc_name"], + {"stat": "Average"}, + ], + [ + "AWS/ECS", + "MemoryUtilization", + "ClusterName", + o["web_cluster"], + "ServiceName", + o["web_svc_name"], + {"stat": "Average"}, + ], + ], + }, + } + ] + if "web_cluster" in o and "web_svc_name" in o + else [] + ), + *( + [ + { + "type": "metric", + "x": 8, + "y": 6, + "width": 8, + "height": 6, + "properties": { + "title": "Worker ECS - CPU and Memory", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ECS", + "CPUUtilization", + "ClusterName", + o["worker_cluster"], + "ServiceName", + o["worker_svc_name"], + {"stat": "Average"}, + ], + [ + "AWS/ECS", + "MemoryUtilization", + "ClusterName", + o["worker_cluster"], + "ServiceName", + o["worker_svc_name"], + {"stat": "Average"}, + ], + ], + }, + } + ] + if "worker_cluster" in o and "worker_svc_name" in o + else [] + ), + *( + [ + { + "type": "metric", + "x": 16, + "y": 6, + "width": 8, + "height": 6, + "properties": { + "title": "Versioncheck ECS - CPU and Memory", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ECS", + "CPUUtilization", + "ClusterName", 
+ o["versioncheck_cluster"], + "ServiceName", + o["versioncheck_svc_name"], + {"stat": "Average"}, + ], + [ + "AWS/ECS", + "MemoryUtilization", + "ClusterName", + o["versioncheck_cluster"], + "ServiceName", + o["versioncheck_svc_name"], + {"stat": "Average"}, + ], + ], + }, + } + ] + if "versioncheck_cluster" in o + and "versioncheck_svc_name" in o + else [] + ), + *( + [ + { + "type": "metric", + "x": 0, + "y": 12, + "width": 12, + "height": 6, + "properties": { + "title": "Amazon MQ - Queue Health", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/AmazonMQ", + "MessageReadyCount", + "Broker", + o["mq_broker_id"], + "VirtualHost", + mq_vhost_dash, + "Queue", + mq_queue, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "MessageUnacknowledgedCount", + "Broker", + o["mq_broker_id"], + "VirtualHost", + mq_vhost_dash, + "Queue", + mq_queue, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "ConsumerCount", + "Broker", + o["mq_broker_id"], + "VirtualHost", + mq_vhost_dash, + "Queue", + mq_queue, + { + "stat": "Minimum", + "yAxis": "right", + }, + ], + ], + "yAxis": { + "right": { + "label": "Consumers", + "showUnits": False, + } + }, + }, + }, + { + "type": "metric", + "x": 12, + "y": 12, + "width": 12, + "height": 6, + "properties": { + "title": "Amazon MQ - Broker Resources", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/AmazonMQ", + "SystemCpuUtilization", + "Broker", + o["mq_broker_id"], + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "RabbitMQMemUsed", + "Broker", + o["mq_broker_id"], + { + "stat": "Average", + "yAxis": "right", + }, + ], + ], + "yAxis": { + "right": { + "label": "Bytes", + "showUnits": False, + } + }, + }, + }, + ] + if "mq_broker_id" in o + else [] + ), + *( + [ + { + "type": "metric", + "x": 0, + "y": 18, + "width": 12, + "height": 6, + "properties": { + "title": "Redis - Memory and Evictions", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ElastiCache", + 
"DatabaseMemoryUsagePercentage", + "CacheClusterId", + o["redis_cluster_id"], + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "Evictions", + "CacheClusterId", + o["redis_cluster_id"], + {"stat": "Sum", "yAxis": "right"}, + ], + ], + "yAxis": { + "right": { + "label": "Count", + "showUnits": False, + } + }, + }, + }, + { + "type": "metric", + "x": 12, + "y": 18, + "width": 12, + "height": 6, + "properties": { + "title": "Redis - CPU and Connections", + "region": region, + "period": dash_period, + "metrics": [ + [ + "AWS/ElastiCache", + "EngineCPUUtilization", + "CacheClusterId", + o["redis_cluster_id"], + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "CPUUtilization", + "CacheClusterId", + o["redis_cluster_id"], + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "CurrConnections", + "CacheClusterId", + o["redis_cluster_id"], + { + "stat": "Average", + "yAxis": "right", + }, + ], + ], + "yAxis": { + "right": { + "label": "Connections", + "showUnits": False, + } + }, + }, + }, + ] + if "redis_cluster_id" in o + else [] + ), + ], + } + ) + ) + + aws.cloudwatch.Dashboard( + f"{project.name_prefix}-dashboard", + dashboard_name=f"{project.name_prefix}-health", + dashboard_body=dashboard_body, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), + ) + + # ----------------------------------------------------------------- + # Monitoring exports + # ----------------------------------------------------------------- + pulumi.export("monitoring_sns_topic_arn", alarm_topic.arn) + pulumi.export( + "monitoring_dashboard_name", + f"{project.name_prefix}-health", + ) + # ========================================================================= # ECS Scheduled Tasks (Cron Jobs) # ========================================================================= diff --git a/infra/pulumi/config.stage.yaml b/infra/pulumi/config.stage.yaml index fa40c65c9171..75c9a6d8ef3a 100644 --- a/infra/pulumi/config.stage.yaml +++ b/infra/pulumi/config.stage.yaml @@ -638,6 +638,52 @@ 
resources: # - efs_filesystem_id -- replaced by Pulumi-managed aws:efs:FileSystem # - mq_admin_password -- replaced by mq_credentials (JSON) + # ============================================================================= + # Monitoring and Alarms (prod-gating baseline) + # ============================================================================= + # Phase 1: ALB, target group, ECS service, Amazon MQ, Redis alarms + # with a single CloudWatch dashboard for is-stage-healthy? triage + # Phase 2 (future): EFS, log metric filters, deployment instability, + # external/shared resources (RDS, Memcached, OpenSearch) + monitoring: + notify_emails_secret_name: atn/stage/monitoring_notify_emails + alarms: + alb: + error_threshold: 10 + error_period: 60 + response_time_threshold: 1 + response_time_period: 60 + evaluation_periods: 2 + target_group: + unhealthy_threshold: 1 + period: 60 + evaluation_periods: 2 + ecs: + cpu_threshold: 80 + memory_threshold: 80 + period: 300 + evaluation_periods: 2 + mq: + message_ready_threshold: 1000 + consumer_alarm_enabled: false # worker desired_count is intentionally 0 in current stage posture + consumer_count_threshold: 1 + cpu_threshold: 80 + memory_bytes_threshold: 512000000 # ~512 MB (mq.t3.micro has 1 GB) + period: 300 + evaluation_periods: 2 + queue_name: olympia + virtual_host: / + redis: + memory_pct_threshold: 80 + eviction_threshold: 100 + cpu_threshold: 80 + host_cpu_threshold: 90 # cache.t3.small has 2 vCPUs; AWS recommends monitoring host CPUUtilization alongside EngineCPUUtilization on nodes with <= 2 vCPUs + connection_threshold: 500 + period: 300 + evaluation_periods: 2 + dashboard: + period: 300 + # ============================================================================= # Notes for implementation: # ============================================================================= From 56b64b1c04a2fc4b32b420f4c0c7d1f473837ca9 Mon Sep 17 00:00:00 2001 From: Jonathan Alvarez Delgado Date: Wed, 29 Apr 2026 01:33:29 +0200 
Subject: [PATCH 2/4] fix(pulumi): correct MQ broker dimension and add availability alarms --- infra/pulumi/__main__.py | 154 ++++++++++++++++++++++++++++++++++----- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py index aaa8315145a8..bcfb3369d3fa 100755 --- a/infra/pulumi/__main__.py +++ b/infra/pulumi/__main__.py @@ -934,6 +934,33 @@ def main(): opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), ) + # If notifications fail to deliver, every other alarm in this stack is + # also silently undelivered. We publish this alarm to the same topic + # for CloudWatch-console visibility; for Phase 2 a secondary channel + # (SMS, Slack, separate topic) should provide an independent path + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-alarm-topic-delivery-failures", + name=f"{project.name_prefix}-alarm-topic-delivery-failures", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"TopicName": alarm_topic.name}, + metric_name="NumberOfNotificationsFailed", + namespace="AWS/SNS", + statistic="Sum", + threshold=1, + period=300, + evaluation_periods=1, + treat_missing_data="notBreaching", + alarm_description=( + "One or more alarm notifications failed delivery from the " + "stage alarm topic. 
Check: SNS subscription confirmations, " + "recipient email validity, topic policy" + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), + ) + # ----------------------------------------------------------------- # ALB alarms (web, versioncheck) # ----------------------------------------------------------------- @@ -952,10 +979,12 @@ def main(): if not svc_alb: continue alb = svc_alb.resources["albs"].get(svc_name) - if not alb: + tg = svc_alb.resources["target_groups"].get(svc_name) + if not alb or not tg: continue lb_suffix = alb.arn_suffix + tg_suffix = tg.arn_suffix aws.cloudwatch.MetricAlarm( f"{project.name_prefix}-{svc_name}-alb-5xx", @@ -986,7 +1015,10 @@ def main(): alarm_actions=[alarm_topic.arn], ok_actions=[alarm_topic.arn], comparison_operator="GreaterThanOrEqualToThreshold", - dimensions={"LoadBalancer": lb_suffix}, + dimensions={ + "LoadBalancer": lb_suffix, + "TargetGroup": tg_suffix, + }, metric_name="HTTPCode_Target_5XX_Count", namespace="AWS/ApplicationELB", statistic="Sum", @@ -1009,7 +1041,10 @@ def main(): alarm_actions=[alarm_topic.arn], ok_actions=[alarm_topic.arn], comparison_operator="GreaterThanOrEqualToThreshold", - dimensions={"LoadBalancer": lb_suffix}, + dimensions={ + "LoadBalancer": lb_suffix, + "TargetGroup": tg_suffix, + }, metric_name="TargetResponseTime", namespace="AWS/ApplicationELB", statistic="Average", @@ -1062,6 +1097,9 @@ def main(): threshold=tg_unhealthy_threshold, period=tg_period, evaluation_periods=tg_eval_periods, + # Positive availability is covered by the healthy-hosts alarm + # below; here we want to flag elevated unhealthy hosts even when + # at least one healthy host remains treat_missing_data="notBreaching", alarm_description=( f"Unhealthy hosts detected in {svc_name} target group. 
" @@ -1072,6 +1110,36 @@ def main(): opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]), ) + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-healthy-hosts", + name=f"{project.name_prefix}-{svc_name}-healthy-hosts", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="LessThanThreshold", + dimensions={ + "TargetGroup": tg.arn_suffix, + "LoadBalancer": alb.arn_suffix, + }, + metric_name="HealthyHostCount", + namespace="AWS/ApplicationELB", + statistic="Minimum", + threshold=tg_cfg.get("healthy_threshold", 1), + period=tg_period, + evaluation_periods=tg_eval_periods, + # Missing data on this metric means the target group has no + # registered targets -- operationally indistinguishable from + # zero healthy hosts and therefore treated as breaching + treat_missing_data="breaching", + alarm_description=( + f"No healthy hosts in {svc_name} target group. " + "Check: is the ECS service running, is the container " + "health-check responding (/services/monitor.json), is " + "the SG allowing traffic from the ALB?" 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]), + ) + # ----------------------------------------------------------------- # ECS service alarms (web, worker, versioncheck) # ----------------------------------------------------------------- @@ -1080,6 +1148,7 @@ def main(): ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80) ecs_period = ecs_cfg.get("period", 300) ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2) + ecs_min_tasks = ecs_cfg.get("min_tasks", 1) for svc_name, fargate_svc in fargate_services.items(): ecs_service = fargate_svc.resources.get("service") @@ -1132,6 +1201,8 @@ def main(): threshold=ecs_mem_threshold, period=ecs_period, evaluation_periods=ecs_eval_periods, + # Positive availability is covered by the running-tasks alarm + # below; CPU/memory only matter while tasks exist treat_missing_data="notBreaching", alarm_description=( f"Memory utilisation above {ecs_mem_threshold}% on {svc_name} service. " @@ -1142,6 +1213,38 @@ def main(): opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]), ) + # Container Insights publishes RunningTaskCount per service in the + # ECS/ContainerInsights namespace. 
Operators draining a service + # intentionally should override `min_tasks` per service in config + # or temporarily disable this alarm + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-running-tasks", + name=f"{project.name_prefix}-{svc_name}-running-tasks", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="LessThanThreshold", + dimensions={ + "ClusterName": cluster_name, + "ServiceName": service_name, + }, + metric_name="RunningTaskCount", + namespace="ECS/ContainerInsights", + statistic="Minimum", + threshold=ecs_min_tasks, + period=ecs_period, + evaluation_periods=ecs_eval_periods, + # Container Insights stops emitting when a service is fully + # drained; that is exactly the failure we want to catch + treat_missing_data="breaching", + alarm_description=( + f"Running task count below {ecs_min_tasks} on {svc_name}. " + "Check: deployment status, service events for stop " + "reasons, scheduled actions, task health" + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]), + ) + # ----------------------------------------------------------------- # Amazon MQ alarms # ----------------------------------------------------------------- @@ -1158,7 +1261,12 @@ def main(): mq_period = mq_cfg.get("period", 300) mq_eval_periods = mq_cfg.get("evaluation_periods", 2) - broker_id = mq_broker.id + # AWS publishes Amazon MQ for RabbitMQ metrics with the `Broker` + # dimension set to the broker name, not the broker ID. The Pulumi + # `aws.mq.Broker.id` output is the AWS broker UUID (e.g. + # b-xxxxxxxx-...) 
which would point at a non-existent metric + # series + broker_name = mq_broker.broker_name aws.cloudwatch.MetricAlarm( f"{project.name_prefix}-mq-message-ready", @@ -1167,7 +1275,7 @@ def main(): ok_actions=[alarm_topic.arn], comparison_operator="GreaterThanOrEqualToThreshold", dimensions={ - "Broker": broker_id, + "Broker": broker_name, "VirtualHost": mq_vhost, "Queue": mq_queue_name, }, @@ -1195,7 +1303,7 @@ def main(): ok_actions=[alarm_topic.arn], comparison_operator="LessThanThreshold", dimensions={ - "Broker": broker_id, + "Broker": broker_name, "VirtualHost": mq_vhost, "Queue": mq_queue_name, }, @@ -1221,14 +1329,16 @@ def main(): alarm_actions=[alarm_topic.arn], ok_actions=[alarm_topic.arn], comparison_operator="GreaterThanOrEqualToThreshold", - dimensions={"Broker": broker_id}, + dimensions={"Broker": broker_name}, metric_name="SystemCpuUtilization", namespace="AWS/AmazonMQ", statistic="Average", threshold=mq_cpu_threshold, period=mq_period, evaluation_periods=mq_eval_periods, - treat_missing_data="notBreaching", + # Managed broker emits resource metrics whenever it is RUNNING; + # absence of data indicates the broker itself is in trouble. + treat_missing_data="breaching", alarm_description=( f"Broker CPU above {mq_cpu_threshold}%. Check: " "queue depth, message throughput, consider " @@ -1244,14 +1354,16 @@ def main(): alarm_actions=[alarm_topic.arn], ok_actions=[alarm_topic.arn], comparison_operator="GreaterThanOrEqualToThreshold", - dimensions={"Broker": broker_id}, + dimensions={"Broker": broker_name}, metric_name="RabbitMQMemUsed", namespace="AWS/AmazonMQ", statistic="Average", threshold=mq_mem_threshold, period=mq_period, evaluation_periods=mq_eval_periods, - treat_missing_data="notBreaching", + # Same rationale as mq-cpu: missing data on a managed broker + # is itself a failure signal. + treat_missing_data="breaching", alarm_description=( f"Broker memory above {mq_mem_threshold} bytes. 
" "Check: queue depth and message sizes, consider " @@ -1277,6 +1389,12 @@ def main(): redis_eval_periods = redis_cfg.get("evaluation_periods", 2) replication_group = redis_cluster.resources["replication_group"] + # ElastiCache publishes per-node metrics under the cache cluster ID, + # which for a single-node replication group is `-001`. Verify + # at first deploy by reading one CloudWatch datapoint for the + # alarms below; if the dimension value does not match an emitted + # series, switch to `replication_group.member_clusters[0]` (a list + # output that holds the actual cache cluster IDs) cache_cluster_id = replication_group.id.apply(lambda rg_id: f"{rg_id}-001") aws.cloudwatch.MetricAlarm( @@ -1442,7 +1560,9 @@ def main(): ) if mq_broker is not None: - dashboard_outputs["mq_broker_id"] = mq_broker.id + # Dashboard widgets feed this value into the `Broker` CloudWatch + # dimension, which AWS keys by broker name (not the b-... UUID) + dashboard_outputs["mq_broker_name"] = mq_broker.broker_name if redis_cluster: dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[ @@ -1704,7 +1824,7 @@ def main(): "AWS/AmazonMQ", "MessageReadyCount", "Broker", - o["mq_broker_id"], + o["mq_broker_name"], "VirtualHost", mq_vhost_dash, "Queue", @@ -1715,7 +1835,7 @@ def main(): "AWS/AmazonMQ", "MessageUnacknowledgedCount", "Broker", - o["mq_broker_id"], + o["mq_broker_name"], "VirtualHost", mq_vhost_dash, "Queue", @@ -1726,7 +1846,7 @@ def main(): "AWS/AmazonMQ", "ConsumerCount", "Broker", - o["mq_broker_id"], + o["mq_broker_name"], "VirtualHost", mq_vhost_dash, "Queue", @@ -1760,14 +1880,14 @@ def main(): "AWS/AmazonMQ", "SystemCpuUtilization", "Broker", - o["mq_broker_id"], + o["mq_broker_name"], {"stat": "Average"}, ], [ "AWS/AmazonMQ", "RabbitMQMemUsed", "Broker", - o["mq_broker_id"], + o["mq_broker_name"], { "stat": "Average", "yAxis": "right", @@ -1783,7 +1903,7 @@ def main(): }, }, ] - if "mq_broker_id" in o + if "mq_broker_name" in o else [] ), *( From 
7882889378eed35087e74ce4e8df5932e9533ade Mon Sep 17 00:00:00 2001 From: Jonathan Alvarez Delgado Date: Wed, 29 Apr 2026 04:07:27 +0200 Subject: [PATCH 3/4] refactor(pulumi): extract dashboard widget builders to module level --- infra/pulumi/__main__.py | 676 +++++++++++++++------------------------ 1 file changed, 259 insertions(+), 417 deletions(-) diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py index bcfb3369d3fa..03bc04653a1f 100755 --- a/infra/pulumi/__main__.py +++ b/infra/pulumi/__main__.py @@ -36,6 +36,263 @@ import tb_pulumi.network +# --------------------------------------------------------------------------- +# CloudWatch dashboard widget builders +# +# Each builder takes the resolved-output dict `o` (string values produced by +# pulumi.Output.all().apply) plus pure-Python layout config, and returns the +# CloudWatch dashboard widget shape as a dict (or list of dicts) +# They have no Pulumi deps and can be unit tested in isolation +# --------------------------------------------------------------------------- + + +def _alb_requests_widget(o, svc_name, region, period, x, y): + suffix = o[f"{svc_name}_alb_suffix"] + return { + "type": "metric", + "x": x, + "y": y, + "width": 12, + "height": 6, + "properties": { + "title": f"{svc_name.capitalize()} ALB - Requests and Errors", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ApplicationELB", + "RequestCount", + "LoadBalancer", + suffix, + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_ELB_5XX_Count", + "LoadBalancer", + suffix, + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_Target_5XX_Count", + "LoadBalancer", + suffix, + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "TargetResponseTime", + "LoadBalancer", + suffix, + {"stat": "Average", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Seconds", "showUnits": False}}, + }, + } + + +def _ecs_resources_widget(o, svc_name, region, period, x, y, width=8): + cluster = 
o[f"{svc_name}_cluster"] + service = o[f"{svc_name}_svc_name"] + return { + "type": "metric", + "x": x, + "y": y, + "width": width, + "height": 6, + "properties": { + "title": f"{svc_name.capitalize()} ECS - CPU and Memory", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ECS", + "CPUUtilization", + "ClusterName", + cluster, + "ServiceName", + service, + {"stat": "Average"}, + ], + [ + "AWS/ECS", + "MemoryUtilization", + "ClusterName", + cluster, + "ServiceName", + service, + {"stat": "Average"}, + ], + ], + }, + } + + +def _mq_widgets(o, region, period, queue, vhost): + broker = o["mq_broker_name"] + queue_dims = ["Broker", broker, "VirtualHost", vhost, "Queue", queue] + return [ + { + "type": "metric", + "x": 0, + "y": 12, + "width": 12, + "height": 6, + "properties": { + "title": "Amazon MQ - Queue Health", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/AmazonMQ", + "MessageReadyCount", + *queue_dims, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "MessageUnacknowledgedCount", + *queue_dims, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "ConsumerCount", + *queue_dims, + {"stat": "Minimum", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Consumers", "showUnits": False}}, + }, + }, + { + "type": "metric", + "x": 12, + "y": 12, + "width": 12, + "height": 6, + "properties": { + "title": "Amazon MQ - Broker Resources", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/AmazonMQ", + "SystemCpuUtilization", + "Broker", + broker, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "RabbitMQMemUsed", + "Broker", + broker, + {"stat": "Average", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Bytes", "showUnits": False}}, + }, + }, + ] + + +def _redis_widgets(o, region, period): + cluster_id = o["redis_cluster_id"] + return [ + { + "type": "metric", + "x": 0, + "y": 18, + "width": 12, + "height": 6, + "properties": { + "title": "Redis - Memory and Evictions", + "region": region, + 
"period": period, + "metrics": [ + [ + "AWS/ElastiCache", + "DatabaseMemoryUsagePercentage", + "CacheClusterId", + cluster_id, + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "Evictions", + "CacheClusterId", + cluster_id, + {"stat": "Sum", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Count", "showUnits": False}}, + }, + }, + { + "type": "metric", + "x": 12, + "y": 18, + "width": 12, + "height": 6, + "properties": { + "title": "Redis - CPU and Connections", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ElastiCache", + "EngineCPUUtilization", + "CacheClusterId", + cluster_id, + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "CPUUtilization", + "CacheClusterId", + cluster_id, + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "CurrConnections", + "CacheClusterId", + cluster_id, + {"stat": "Average", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Connections", "showUnits": False}}, + }, + }, + ] + + +def _build_dashboard_body(o, region, period, mq_queue, mq_vhost): + widgets = [] + if "web_alb_suffix" in o: + widgets.append(_alb_requests_widget(o, "web", region, period, x=0, y=0)) + if "versioncheck_alb_suffix" in o: + widgets.append( + _alb_requests_widget(o, "versioncheck", region, period, x=12, y=0) + ) + if "web_cluster" in o and "web_svc_name" in o: + widgets.append(_ecs_resources_widget(o, "web", region, period, x=0, y=6)) + if "worker_cluster" in o and "worker_svc_name" in o: + widgets.append(_ecs_resources_widget(o, "worker", region, period, x=8, y=6)) + if "versioncheck_cluster" in o and "versioncheck_svc_name" in o: + widgets.append( + _ecs_resources_widget(o, "versioncheck", region, period, x=16, y=6) + ) + if "mq_broker_name" in o: + widgets.extend(_mq_widgets(o, region, period, mq_queue, mq_vhost)) + if "redis_cluster_id" in o: + widgets.extend(_redis_widgets(o, region, period)) + return json.dumps({"widgets": widgets}) + + def main(): # Create a ThunderbirdPulumiProject to aggregate 
resources # This loads config.{stack}.yaml automatically @@ -1575,423 +1832,8 @@ def main(): if dashboard_outputs: dashboard_body = pulumi.Output.all(**dashboard_outputs).apply( - lambda o: json.dumps( - { - "widgets": [ - *( - [ - { - "type": "metric", - "x": 0, - "y": 0, - "width": 12, - "height": 6, - "properties": { - "title": "Web ALB - Requests and Errors", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ApplicationELB", - "RequestCount", - "LoadBalancer", - o["web_alb_suffix"], - {"stat": "Sum"}, - ], - [ - "AWS/ApplicationELB", - "HTTPCode_ELB_5XX_Count", - "LoadBalancer", - o["web_alb_suffix"], - {"stat": "Sum"}, - ], - [ - "AWS/ApplicationELB", - "HTTPCode_Target_5XX_Count", - "LoadBalancer", - o["web_alb_suffix"], - {"stat": "Sum"}, - ], - [ - "AWS/ApplicationELB", - "TargetResponseTime", - "LoadBalancer", - o["web_alb_suffix"], - { - "stat": "Average", - "yAxis": "right", - }, - ], - ], - "yAxis": { - "right": { - "label": "Seconds", - "showUnits": False, - } - }, - }, - } - ] - if "web_alb_suffix" in o - else [] - ), - *( - [ - { - "type": "metric", - "x": 12, - "y": 0, - "width": 12, - "height": 6, - "properties": { - "title": "Versioncheck ALB - Requests and Errors", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ApplicationELB", - "RequestCount", - "LoadBalancer", - o["versioncheck_alb_suffix"], - {"stat": "Sum"}, - ], - [ - "AWS/ApplicationELB", - "HTTPCode_ELB_5XX_Count", - "LoadBalancer", - o["versioncheck_alb_suffix"], - {"stat": "Sum"}, - ], - [ - "AWS/ApplicationELB", - "HTTPCode_Target_5XX_Count", - "LoadBalancer", - o["versioncheck_alb_suffix"], - {"stat": "Sum"}, - ], - [ - "AWS/ApplicationELB", - "TargetResponseTime", - "LoadBalancer", - o["versioncheck_alb_suffix"], - { - "stat": "Average", - "yAxis": "right", - }, - ], - ], - "yAxis": { - "right": { - "label": "Seconds", - "showUnits": False, - } - }, - }, - } - ] - if "versioncheck_alb_suffix" in o - else [] - ), - *( - [ - { - "type": 
"metric", - "x": 0, - "y": 6, - "width": 8, - "height": 6, - "properties": { - "title": "Web ECS - CPU and Memory", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ECS", - "CPUUtilization", - "ClusterName", - o["web_cluster"], - "ServiceName", - o["web_svc_name"], - {"stat": "Average"}, - ], - [ - "AWS/ECS", - "MemoryUtilization", - "ClusterName", - o["web_cluster"], - "ServiceName", - o["web_svc_name"], - {"stat": "Average"}, - ], - ], - }, - } - ] - if "web_cluster" in o and "web_svc_name" in o - else [] - ), - *( - [ - { - "type": "metric", - "x": 8, - "y": 6, - "width": 8, - "height": 6, - "properties": { - "title": "Worker ECS - CPU and Memory", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ECS", - "CPUUtilization", - "ClusterName", - o["worker_cluster"], - "ServiceName", - o["worker_svc_name"], - {"stat": "Average"}, - ], - [ - "AWS/ECS", - "MemoryUtilization", - "ClusterName", - o["worker_cluster"], - "ServiceName", - o["worker_svc_name"], - {"stat": "Average"}, - ], - ], - }, - } - ] - if "worker_cluster" in o and "worker_svc_name" in o - else [] - ), - *( - [ - { - "type": "metric", - "x": 16, - "y": 6, - "width": 8, - "height": 6, - "properties": { - "title": "Versioncheck ECS - CPU and Memory", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ECS", - "CPUUtilization", - "ClusterName", - o["versioncheck_cluster"], - "ServiceName", - o["versioncheck_svc_name"], - {"stat": "Average"}, - ], - [ - "AWS/ECS", - "MemoryUtilization", - "ClusterName", - o["versioncheck_cluster"], - "ServiceName", - o["versioncheck_svc_name"], - {"stat": "Average"}, - ], - ], - }, - } - ] - if "versioncheck_cluster" in o - and "versioncheck_svc_name" in o - else [] - ), - *( - [ - { - "type": "metric", - "x": 0, - "y": 12, - "width": 12, - "height": 6, - "properties": { - "title": "Amazon MQ - Queue Health", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/AmazonMQ", - "MessageReadyCount", 
- "Broker", - o["mq_broker_name"], - "VirtualHost", - mq_vhost_dash, - "Queue", - mq_queue, - {"stat": "Average"}, - ], - [ - "AWS/AmazonMQ", - "MessageUnacknowledgedCount", - "Broker", - o["mq_broker_name"], - "VirtualHost", - mq_vhost_dash, - "Queue", - mq_queue, - {"stat": "Average"}, - ], - [ - "AWS/AmazonMQ", - "ConsumerCount", - "Broker", - o["mq_broker_name"], - "VirtualHost", - mq_vhost_dash, - "Queue", - mq_queue, - { - "stat": "Minimum", - "yAxis": "right", - }, - ], - ], - "yAxis": { - "right": { - "label": "Consumers", - "showUnits": False, - } - }, - }, - }, - { - "type": "metric", - "x": 12, - "y": 12, - "width": 12, - "height": 6, - "properties": { - "title": "Amazon MQ - Broker Resources", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/AmazonMQ", - "SystemCpuUtilization", - "Broker", - o["mq_broker_name"], - {"stat": "Average"}, - ], - [ - "AWS/AmazonMQ", - "RabbitMQMemUsed", - "Broker", - o["mq_broker_name"], - { - "stat": "Average", - "yAxis": "right", - }, - ], - ], - "yAxis": { - "right": { - "label": "Bytes", - "showUnits": False, - } - }, - }, - }, - ] - if "mq_broker_name" in o - else [] - ), - *( - [ - { - "type": "metric", - "x": 0, - "y": 18, - "width": 12, - "height": 6, - "properties": { - "title": "Redis - Memory and Evictions", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ElastiCache", - "DatabaseMemoryUsagePercentage", - "CacheClusterId", - o["redis_cluster_id"], - {"stat": "Average"}, - ], - [ - "AWS/ElastiCache", - "Evictions", - "CacheClusterId", - o["redis_cluster_id"], - {"stat": "Sum", "yAxis": "right"}, - ], - ], - "yAxis": { - "right": { - "label": "Count", - "showUnits": False, - } - }, - }, - }, - { - "type": "metric", - "x": 12, - "y": 18, - "width": 12, - "height": 6, - "properties": { - "title": "Redis - CPU and Connections", - "region": region, - "period": dash_period, - "metrics": [ - [ - "AWS/ElastiCache", - "EngineCPUUtilization", - "CacheClusterId", - 
o["redis_cluster_id"], - {"stat": "Average"}, - ], - [ - "AWS/ElastiCache", - "CPUUtilization", - "CacheClusterId", - o["redis_cluster_id"], - {"stat": "Average"}, - ], - [ - "AWS/ElastiCache", - "CurrConnections", - "CacheClusterId", - o["redis_cluster_id"], - { - "stat": "Average", - "yAxis": "right", - }, - ], - ], - "yAxis": { - "right": { - "label": "Connections", - "showUnits": False, - } - }, - }, - }, - ] - if "redis_cluster_id" in o - else [] - ), - ], - } + lambda o: _build_dashboard_body( + o, region, dash_period, mq_queue, mq_vhost_dash ) ) From 1c797f462ba2223af2b1d0134792c8ad95100df5 Mon Sep 17 00:00:00 2001 From: Jonathan Alvarez Delgado Date: Wed, 29 Apr 2026 05:12:05 +0200 Subject: [PATCH 4/4] fix(pulumi): use member_clusters and per-service min_tasks; add availability widgets --- infra/pulumi/__main__.py | 203 ++++++++++++++++++++++++++++++++------- 1 file changed, 167 insertions(+), 36 deletions(-) diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py index 03bc04653a1f..dfea9b89c460 100755 --- a/infra/pulumi/__main__.py +++ b/infra/pulumi/__main__.py @@ -47,7 +47,12 @@ def _alb_requests_widget(o, svc_name, region, period, x, y): - suffix = o[f"{svc_name}_alb_suffix"] + # Target side metrics use LoadBalancer, TargetGroup to match the alarm + # dimensions; ALB side metrics keep LoadBalancer only because that is + # the dimension AWS publishes them under + lb = o[f"{svc_name}_alb_suffix"] + tg = o.get(f"{svc_name}_tg_suffix") + target_dims = ["LoadBalancer", lb] + (["TargetGroup", tg] if tg else []) return { "type": "metric", "x": x, @@ -63,28 +68,26 @@ def _alb_requests_widget(o, svc_name, region, period, x, y): "AWS/ApplicationELB", "RequestCount", "LoadBalancer", - suffix, + lb, {"stat": "Sum"}, ], [ "AWS/ApplicationELB", "HTTPCode_ELB_5XX_Count", "LoadBalancer", - suffix, + lb, {"stat": "Sum"}, ], [ "AWS/ApplicationELB", "HTTPCode_Target_5XX_Count", - "LoadBalancer", - suffix, + *target_dims, {"stat": "Sum"}, ], [ 
"AWS/ApplicationELB", "TargetResponseTime", - "LoadBalancer", - suffix, + *target_dims, {"stat": "Average", "yAxis": "right"}, ], ], @@ -130,14 +133,14 @@ def _ecs_resources_widget(o, svc_name, region, period, x, y, width=8): } -def _mq_widgets(o, region, period, queue, vhost): +def _mq_widgets(o, region, period, queue, vhost, y): broker = o["mq_broker_name"] queue_dims = ["Broker", broker, "VirtualHost", vhost, "Queue", queue] return [ { "type": "metric", "x": 0, - "y": 12, + "y": y, "width": 12, "height": 6, "properties": { @@ -170,7 +173,7 @@ def _mq_widgets(o, region, period, queue, vhost): { "type": "metric", "x": 12, - "y": 12, + "y": y, "width": 12, "height": 6, "properties": { @@ -199,13 +202,13 @@ def _mq_widgets(o, region, period, queue, vhost): ] -def _redis_widgets(o, region, period): +def _redis_widgets(o, region, period, y): cluster_id = o["redis_cluster_id"] return [ { "type": "metric", "x": 0, - "y": 18, + "y": y, "width": 12, "height": 6, "properties": { @@ -234,7 +237,7 @@ def _redis_widgets(o, region, period): { "type": "metric", "x": 12, - "y": 18, + "y": y, "width": 12, "height": 6, "properties": { @@ -270,26 +273,140 @@ def _redis_widgets(o, region, period): ] +def _availability_widgets(o, region, period, mq_queue, mq_vhost, y): + # "Is it alive?" panels rendered at the top of the dashboard so positive + # availability reads first. 
Each sub-widget is conditional on the keys + # for its source being present in `o` + widgets = [] + + alb_metrics = [] + for svc_name in ("web", "versioncheck"): + lb = o.get(f"{svc_name}_alb_suffix") + tg = o.get(f"{svc_name}_tg_suffix") + if lb and tg: + alb_metrics.append( + [ + "AWS/ApplicationELB", + "HealthyHostCount", + "TargetGroup", + tg, + "LoadBalancer", + lb, + {"stat": "Minimum", "label": f"{svc_name} healthy"}, + ] + ) + if alb_metrics: + widgets.append( + { + "type": "metric", + "x": 0, + "y": y, + "width": 8, + "height": 6, + "properties": { + "title": "ALB - Healthy Hosts", + "region": region, + "period": period, + "stat": "Minimum", + "metrics": alb_metrics, + }, + } + ) + + ecs_metrics = [] + for svc_name in ("web", "worker", "versioncheck"): + cluster = o.get(f"{svc_name}_cluster") + service = o.get(f"{svc_name}_svc_name") + if cluster and service: + ecs_metrics.append( + [ + "ECS/ContainerInsights", + "RunningTaskCount", + "ClusterName", + cluster, + "ServiceName", + service, + {"stat": "Minimum", "label": f"{svc_name} running"}, + ] + ) + if ecs_metrics: + widgets.append( + { + "type": "metric", + "x": 8, + "y": y, + "width": 8, + "height": 6, + "properties": { + "title": "ECS - Running Tasks", + "region": region, + "period": period, + "stat": "Minimum", + "metrics": ecs_metrics, + }, + } + ) + + if "mq_broker_name" in o: + broker = o["mq_broker_name"] + widgets.append( + { + "type": "metric", + "x": 16, + "y": y, + "width": 8, + "height": 6, + "properties": { + "title": f"MQ - Consumers on '{mq_queue}'", + "region": region, + "period": period, + "stat": "Minimum", + "metrics": [ + [ + "AWS/AmazonMQ", + "ConsumerCount", + "Broker", + broker, + "VirtualHost", + mq_vhost, + "Queue", + mq_queue, + {"stat": "Minimum"}, + ], + ], + }, + } + ) + + return widgets + + def _build_dashboard_body(o, region, period, mq_queue, mq_vhost): widgets = [] + # Row 0: availability - is it alive? 
+ widgets.extend(_availability_widgets(o, region, period, mq_queue, mq_vhost, y=0)) + # Row 1: ALB requests/errors per service if "web_alb_suffix" in o: - widgets.append(_alb_requests_widget(o, "web", region, period, x=0, y=0)) + widgets.append(_alb_requests_widget(o, "web", region, period, x=0, y=6)) if "versioncheck_alb_suffix" in o: widgets.append( - _alb_requests_widget(o, "versioncheck", region, period, x=12, y=0) + _alb_requests_widget(o, "versioncheck", region, period, x=12, y=6) ) + # Row 2: ECS CPU/memory per service if "web_cluster" in o and "web_svc_name" in o: - widgets.append(_ecs_resources_widget(o, "web", region, period, x=0, y=6)) + widgets.append(_ecs_resources_widget(o, "web", region, period, x=0, y=12)) if "worker_cluster" in o and "worker_svc_name" in o: - widgets.append(_ecs_resources_widget(o, "worker", region, period, x=8, y=6)) + widgets.append(_ecs_resources_widget(o, "worker", region, period, x=8, y=12)) if "versioncheck_cluster" in o and "versioncheck_svc_name" in o: widgets.append( - _ecs_resources_widget(o, "versioncheck", region, period, x=16, y=6) + _ecs_resources_widget(o, "versioncheck", region, period, x=16, y=12) ) + # Row 3: Amazon MQ if "mq_broker_name" in o: - widgets.extend(_mq_widgets(o, region, period, mq_queue, mq_vhost)) + widgets.extend(_mq_widgets(o, region, period, mq_queue, mq_vhost, y=18)) + # Row 4: Redis if "redis_cluster_id" in o: - widgets.extend(_redis_widgets(o, region, period)) + widgets.extend(_redis_widgets(o, region, period, y=24)) return json.dumps({"widgets": widgets}) @@ -1144,7 +1261,7 @@ def main(): ) # ========================================================================= - # Monitoring and Alarms (prod-gating baseline) + # Monitoring and Alarms (stage env-gating baseline) # ========================================================================= # Phase 1 observability: SNS notification path, CloudWatch alarms for # ALB/TG/ECS/MQ/Redis, and one operational dashboard @@ -1405,7 +1522,8 @@ def 
main(): ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80) ecs_period = ecs_cfg.get("period", 300) ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2) - ecs_min_tasks = ecs_cfg.get("min_tasks", 1) + ecs_min_tasks_default = ecs_cfg.get("min_tasks", 1) + ecs_min_tasks_per_svc = ecs_cfg.get("min_tasks_per_service", {}) for svc_name, fargate_svc in fargate_services.items(): ecs_service = fargate_svc.resources.get("service") @@ -1470,10 +1588,19 @@ def main(): opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]), ) - # Container Insights publishes RunningTaskCount per service in the - # ECS/ContainerInsights namespace. Operators draining a service - # intentionally should override `min_tasks` per service in config - # or temporarily disable this alarm + # RunningTaskCount lives in the ECS/ContainerInsights namespace, + # which is only populated when Container Insights is enabled on + # the cluster. tb_pulumi.fargate enables it via the + # `enable_container_insights: true` flag in config.stage.yaml. 
If + # that ever flips to false, this alarm goes immediately to ALARM + # because of the breaching missing-data treatment below - that + # surfaces the misconfiguration loudly and not silently + # disabling availability monitoring + # + # Operators intentionally draining a service should override + # `min_tasks_per_service` in config (e.g., `worker: 0`) or + # temporarily disable this alarm + min_tasks = ecs_min_tasks_per_svc.get(svc_name, ecs_min_tasks_default) aws.cloudwatch.MetricAlarm( f"{project.name_prefix}-{svc_name}-running-tasks", name=f"{project.name_prefix}-{svc_name}-running-tasks", @@ -1487,14 +1614,14 @@ def main(): metric_name="RunningTaskCount", namespace="ECS/ContainerInsights", statistic="Minimum", - threshold=ecs_min_tasks, + threshold=min_tasks, period=ecs_period, evaluation_periods=ecs_eval_periods, # Container Insights stops emitting when a service is fully # drained; that is exactly the failure we want to catch treat_missing_data="breaching", alarm_description=( - f"Running task count below {ecs_min_tasks} on {svc_name}. " + f"Running task count below {min_tasks} on {svc_name}. " "Check: deployment status, service events for stop " "reasons, scheduled actions, task health" ), @@ -1646,13 +1773,14 @@ def main(): redis_eval_periods = redis_cfg.get("evaluation_periods", 2) replication_group = redis_cluster.resources["replication_group"] - # ElastiCache publishes per-node metrics under the cache cluster ID, - # which for a single-node replication group is `-001`. Verify - # at first deploy by reading one CloudWatch datapoint for the - # alarms below; if the dimension value does not match an emitted - # series, switch to `replication_group.member_clusters[0]` (a list - # output that holds the actual cache cluster IDs) - cache_cluster_id = replication_group.id.apply(lambda rg_id: f"{rg_id}-001") + # ElastiCache publishes per-node metrics under the cache cluster ID. 
+ # Use the provider's actual member_clusters output rather than + # reconstructing the AWS naming convention (`-001`); for our + # single-node replication group this resolves to the same value but + # is robust against multi-node setups and AWS naming changes + cache_cluster_id = replication_group.member_clusters.apply( + lambda clusters: clusters[0] + ) aws.cloudwatch.MetricAlarm( f"{project.name_prefix}-redis-memory", @@ -1794,8 +1922,11 @@ def main(): svc_alb = fargate_svc.resources.get("fargate_service_alb") if svc_alb: alb = svc_alb.resources["albs"].get(svc_name) + tg = svc_alb.resources["target_groups"].get(svc_name) if alb: dashboard_outputs[f"{svc_name}_alb_suffix"] = alb.arn_suffix + if tg: + dashboard_outputs[f"{svc_name}_tg_suffix"] = tg.arn_suffix svc_res = fargate_svc.resources.get("service") cluster_res = fargate_svc.resources.get("cluster") if svc_res: @@ -1824,7 +1955,7 @@ def main(): if redis_cluster: dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[ "replication_group" - ].id.apply(lambda rg_id: f"{rg_id}-001") + ].member_clusters.apply(lambda clusters: clusters[0]) mq_queue = alarm_cfg.get("mq", {}).get("queue_name", "olympia") mq_vhost_dash = alarm_cfg.get("mq", {}).get("virtual_host", "/")