From e726563d25c5fd3570b965bbbc85bb4321482bf5 Mon Sep 17 00:00:00 2001
From: Jonathan Alvarez Delgado <jonathan.adl@proton.me>
Date: Thu, 16 Apr 2026 18:08:35 +0200
Subject: [PATCH 1/4] feat(pulumi): add monitoring baseline with explicit CW
 alarms

---
 infra/pulumi/__main__.py       | 1006 ++++++++++++++++++++++++++++++++
 infra/pulumi/config.stage.yaml |   46 ++
 2 files changed, 1052 insertions(+)

diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py
index 9baacf48f1f3..aaa8315145a8 100755
--- a/infra/pulumi/__main__.py
+++ b/infra/pulumi/__main__.py
@@ -768,6 +768,7 @@ def main():
     # Dedicated stage broker replacing the production EC2 RabbitMQ that
     # atn/stage/celery_broker previously pointed to (issue #375)
     mq_config = resources.get("aws:mq:RabbitMQBroker", {})
+    mq_broker = None
 
     if mq_config and private_subnets and vpc_resource:
         mq_creds_secret_name = mq_config.get("credentials_secret_name")
@@ -885,6 +886,1011 @@ def main():
             ),
         )
 
+    # =========================================================================
+    # Monitoring and Alarms (prod-gating baseline)
+    # =========================================================================
+    # Phase 1 observability: SNS notification path, CloudWatch alarms for
+    # ALB/TG/ECS/MQ/Redis, and one operational dashboard
+    #
+    # All alarms are written explicitly (not via CloudWatchMonitoringGroup)
+    # to get a single SNS topic, full control over alarm descriptions, and
+    # correct metric names (upstream tb_pulumi has a target_5xx metric bug)
+    #
+    # Thresholds live in config.stage.yaml under resources.monitoring.alarms
+    monitoring_cfg = resources.get("monitoring", {})
+    alarm_cfg = monitoring_cfg.get("alarms", {})
+
+    if monitoring_cfg and fargate_services:
+        notify_secret_name = monitoring_cfg.get("notify_emails_secret_name")
+        notify_emails = []
+        if notify_secret_name:
+            notify_emails_raw = aws.secretsmanager.get_secret_version(
+                secret_id=notify_secret_name,
+            )
+            notify_emails = [
+                e.strip()
+                for e in notify_emails_raw.secret_string.split(",")
+                if e.strip()
+            ]
+
+        # -----------------------------------------------------------------
+        # SNS topic + email subscriptions
+        # -----------------------------------------------------------------
+        alarm_topic = aws.sns.Topic(
+            f"{project.name_prefix}-alarm-topic",
+            name=f"{project.name_prefix}-alarms",
+            tags={
+                **project.common_tags,
+                "Name": f"{project.name_prefix}-alarms",
+            },
+        )
+
+        for idx, email in enumerate(notify_emails):
+            aws.sns.TopicSubscription(
+                f"{project.name_prefix}-alarm-sub-{idx}",
+                protocol="email",
+                endpoint=email,
+                topic=alarm_topic.arn,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic]),
+            )
+
+        # -----------------------------------------------------------------
+        # ALB alarms (web, versioncheck)
+        # -----------------------------------------------------------------
+        alb_cfg = alarm_cfg.get("alb", {})
+        alb_error_threshold = alb_cfg.get("error_threshold", 10)
+        alb_error_period = alb_cfg.get("error_period", 60)
+        alb_rt_threshold = alb_cfg.get("response_time_threshold", 1)
+        alb_rt_period = alb_cfg.get("response_time_period", 60)
+        alb_eval_periods = alb_cfg.get("evaluation_periods", 2)
+
+        for svc_name in ["web", "versioncheck"]:
+            fargate_svc = fargate_services.get(svc_name)
+            if not fargate_svc:
+                continue
+            svc_alb = fargate_svc.resources.get("fargate_service_alb")
+            if not svc_alb:
+                continue
+            alb = svc_alb.resources["albs"].get(svc_name)
+            if not alb:
+                continue
+
+            lb_suffix = alb.arn_suffix
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-alb-5xx",
+                name=f"{project.name_prefix}-{svc_name}-alb-5xx",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"LoadBalancer": lb_suffix},
+                metric_name="HTTPCode_ELB_5XX_Count",
+                namespace="AWS/ApplicationELB",
+                statistic="Sum",
+                threshold=alb_error_threshold,
+                period=alb_error_period,
+                evaluation_periods=alb_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Elevated 5xx errors on the {svc_name} ALB. "
+                    "Check: ECS task health in console, then "
+                    "application logs in CloudWatch for stack traces."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-target-5xx",
+                name=f"{project.name_prefix}-{svc_name}-target-5xx",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"LoadBalancer": lb_suffix},
+                metric_name="HTTPCode_Target_5XX_Count",
+                namespace="AWS/ApplicationELB",
+                statistic="Sum",
+                threshold=alb_error_threshold,
+                period=alb_error_period,
+                evaluation_periods=alb_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Elevated 5xx errors from {svc_name} application targets. "
+                    "Check: application logs for exceptions, database "
+                    "connectivity, and upstream dependency health."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-response-time",
+                name=f"{project.name_prefix}-{svc_name}-response-time",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"LoadBalancer": lb_suffix},
+                metric_name="TargetResponseTime",
+                namespace="AWS/ApplicationELB",
+                statistic="Average",
+                threshold=alb_rt_threshold,
+                period=alb_rt_period,
+                evaluation_periods=alb_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Average response time above {alb_rt_threshold}s on {svc_name}. "
+                    "Check: is traffic elevated? Are database queries slow? "
+                    "Is Memcached reachable?"
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]),
+            )
+
+        # -----------------------------------------------------------------
+        # Target group alarms (web, versioncheck)
+        # -----------------------------------------------------------------
+        tg_cfg = alarm_cfg.get("target_group", {})
+        tg_unhealthy_threshold = tg_cfg.get("unhealthy_threshold", 1)
+        tg_period = tg_cfg.get("period", 60)
+        tg_eval_periods = tg_cfg.get("evaluation_periods", 2)
+
+        for svc_name in ["web", "versioncheck"]:
+            fargate_svc = fargate_services.get(svc_name)
+            if not fargate_svc:
+                continue
+            svc_alb = fargate_svc.resources.get("fargate_service_alb")
+            if not svc_alb:
+                continue
+            alb = svc_alb.resources["albs"].get(svc_name)
+            tg = svc_alb.resources["target_groups"].get(svc_name)
+            if not alb or not tg:
+                continue
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-unhealthy-hosts",
+                name=f"{project.name_prefix}-{svc_name}-unhealthy-hosts",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "TargetGroup": tg.arn_suffix,
+                    "LoadBalancer": alb.arn_suffix,
+                },
+                metric_name="UnHealthyHostCount",
+                namespace="AWS/ApplicationELB",
+                statistic="Average",
+                threshold=tg_unhealthy_threshold,
+                period=tg_period,
+                evaluation_periods=tg_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Unhealthy hosts detected in {svc_name} target group. "
+                    "Check: ECS task status, health check endpoint "
+                    "(/services/monitor.json), container logs."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]),
+            )
+
+        # -----------------------------------------------------------------
+        # ECS CPU/memory alarms for each Fargate service (e.g. web, worker, versioncheck)
+        # -----------------------------------------------------------------
+        ecs_cfg = alarm_cfg.get("ecs", {})
+        ecs_cpu_threshold = ecs_cfg.get("cpu_threshold", 80)
+        ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80)
+        ecs_period = ecs_cfg.get("period", 300)
+        ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2)
+
+        for svc_name, fargate_svc in fargate_services.items():
+            ecs_service = fargate_svc.resources.get("service")
+            ecs_cluster = fargate_svc.resources.get("cluster")
+            if not ecs_service or not ecs_cluster:
+                continue
+
+            cluster_name = ecs_cluster.arn.apply(lambda arn: arn.split("/")[-1])
+            service_name = ecs_service.name
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-ecs-cpu",
+                name=f"{project.name_prefix}-{svc_name}-ecs-cpu",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "ClusterName": cluster_name,
+                    "ServiceName": service_name,
+                },
+                metric_name="CPUUtilization",
+                namespace="AWS/ECS",
+                statistic="Average",
+                threshold=ecs_cpu_threshold,
+                period=ecs_period,
+                evaluation_periods=ecs_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"CPU utilisation above {ecs_cpu_threshold}% on {svc_name} service. "
+                    "Check: is traffic elevated? Are tasks stuck? "
+                    "Consider scaling if sustained."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-ecs-memory",
+                name=f"{project.name_prefix}-{svc_name}-ecs-memory",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "ClusterName": cluster_name,
+                    "ServiceName": service_name,
+                },
+                metric_name="MemoryUtilization",
+                namespace="AWS/ECS",
+                statistic="Average",
+                threshold=ecs_mem_threshold,
+                period=ecs_period,
+                evaluation_periods=ecs_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Memory utilisation above {ecs_mem_threshold}% on {svc_name} service. "
+                    "Check: application memory leaks, task resource limits, "
+                    "consider scaling."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
+            )
+
+        # -----------------------------------------------------------------
+        # Amazon MQ alarms
+        # -----------------------------------------------------------------
+        mq_cfg = alarm_cfg.get("mq", {})
+
+        if mq_broker is not None:
+            mq_queue_name = mq_cfg.get("queue_name", "olympia")
+            mq_vhost = mq_cfg.get("virtual_host", "/")
+            mq_msg_threshold = mq_cfg.get("message_ready_threshold", 1000)
+            mq_consumer_alarm_enabled = mq_cfg.get("consumer_alarm_enabled", False)
+            mq_consumer_threshold = mq_cfg.get("consumer_count_threshold", 1)
+            mq_cpu_threshold = mq_cfg.get("cpu_threshold", 80)
+            mq_mem_threshold = mq_cfg.get("memory_bytes_threshold", 512000000)
+            mq_period = mq_cfg.get("period", 300)
+            mq_eval_periods = mq_cfg.get("evaluation_periods", 2)
+
+            broker_id = mq_broker.id
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-mq-message-ready",
+                name=f"{project.name_prefix}-mq-message-ready",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "Broker": broker_id,
+                    "VirtualHost": mq_vhost,
+                    "Queue": mq_queue_name,
+                },
+                metric_name="MessageReadyCount",
+                namespace="AWS/AmazonMQ",
+                statistic="Average",
+                threshold=mq_msg_threshold,
+                period=mq_period,
+                evaluation_periods=mq_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Queue '{mq_queue_name}' has over {mq_msg_threshold} "
+                    "ready messages. Check: is the worker consuming? "
+                    "Are tasks backing up? Check worker logs."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]),
+            )
+
+            if mq_consumer_alarm_enabled:
+                aws.cloudwatch.MetricAlarm(
+                    f"{project.name_prefix}-mq-consumer-count",
+                    name=f"{project.name_prefix}-mq-consumer-count",
+                    alarm_actions=[alarm_topic.arn],
+                    ok_actions=[alarm_topic.arn],
+                    comparison_operator="LessThanThreshold",
+                    dimensions={
+                        "Broker": broker_id,
+                        "VirtualHost": mq_vhost,
+                        "Queue": mq_queue_name,
+                    },
+                    metric_name="ConsumerCount",
+                    namespace="AWS/AmazonMQ",
+                    statistic="Minimum",
+                    threshold=mq_consumer_threshold,
+                    period=mq_period,
+                    evaluation_periods=mq_eval_periods,
+                    treat_missing_data="breaching",
+                    alarm_description=(
+                        f"No consumers connected to the '{mq_queue_name}' queue. "
+                        "Check: is the worker service running? "
+                        "Check worker logs for connection errors."
+                    ),
+                    tags=project.common_tags,
+                    opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]),
+                )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-mq-cpu",
+                name=f"{project.name_prefix}-mq-cpu",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"Broker": broker_id},
+                metric_name="SystemCpuUtilization",
+                namespace="AWS/AmazonMQ",
+                statistic="Average",
+                threshold=mq_cpu_threshold,
+                period=mq_period,
+                evaluation_periods=mq_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Broker CPU above {mq_cpu_threshold}%. Check: "
+                    "queue depth, message throughput, consider "
+                    "upgrading instance type if sustained."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-mq-memory",
+                name=f"{project.name_prefix}-mq-memory",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"Broker": broker_id},
+                metric_name="RabbitMQMemUsed",
+                namespace="AWS/AmazonMQ",
+                statistic="Average",
+                threshold=mq_mem_threshold,
+                period=mq_period,
+                evaluation_periods=mq_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Broker memory above {mq_mem_threshold} bytes. "
+                    "Check: queue depth and message sizes, consider "
+                    "purging stale queues or upgrading instance."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]),
+            )
+
+        # -----------------------------------------------------------------
+        # Redis alarms
+        # -----------------------------------------------------------------
+        redis_cfg = alarm_cfg.get("redis", {})
+        redis_cluster = elasticache_clusters.get("redis")
+
+        if redis_cluster:
+            redis_mem_threshold = redis_cfg.get("memory_pct_threshold", 80)
+            redis_eviction_threshold = redis_cfg.get("eviction_threshold", 100)
+            redis_cpu_threshold = redis_cfg.get("cpu_threshold", 80)
+            redis_host_cpu_threshold = redis_cfg.get("host_cpu_threshold", 90)
+            redis_conn_threshold = redis_cfg.get("connection_threshold", 500)
+            redis_period = redis_cfg.get("period", 300)
+            redis_eval_periods = redis_cfg.get("evaluation_periods", 2)
+
+            replication_group = redis_cluster.resources["replication_group"]
+            cache_cluster_id = replication_group.id.apply(lambda rg_id: f"{rg_id}-001")
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-redis-memory",
+                name=f"{project.name_prefix}-redis-memory",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"CacheClusterId": cache_cluster_id},
+                metric_name="DatabaseMemoryUsagePercentage",
+                namespace="AWS/ElastiCache",
+                statistic="Average",
+                threshold=redis_mem_threshold,
+                period=redis_period,
+                evaluation_periods=redis_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Redis memory usage above {redis_mem_threshold}%. "
+                    "Check: eviction count, key count growth, "
+                    "potential memory leak in application cache usage."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(
+                    depends_on=[alarm_topic, replication_group]
+                ),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-redis-evictions",
+                name=f"{project.name_prefix}-redis-evictions",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"CacheClusterId": cache_cluster_id},
+                metric_name="Evictions",
+                namespace="AWS/ElastiCache",
+                statistic="Sum",
+                threshold=redis_eviction_threshold,
+                period=redis_period,
+                evaluation_periods=redis_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Redis evictions above {redis_eviction_threshold} "
+                    "per period. Check: memory usage, maxmemory-policy, "
+                    "whether the application is over-caching."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(
+                    depends_on=[alarm_topic, replication_group]
+                ),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-redis-cpu",
+                name=f"{project.name_prefix}-redis-cpu",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"CacheClusterId": cache_cluster_id},
+                metric_name="EngineCPUUtilization",
+                namespace="AWS/ElastiCache",
+                statistic="Average",
+                threshold=redis_cpu_threshold,
+                period=redis_period,
+                evaluation_periods=redis_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Redis engine CPU above {redis_cpu_threshold}%. "
+                    "Check: command complexity (KEYS, SORT), "
+                    "connection count, consider node upgrade."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(
+                    depends_on=[alarm_topic, replication_group]
+                ),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-redis-connections",
+                name=f"{project.name_prefix}-redis-connections",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"CacheClusterId": cache_cluster_id},
+                metric_name="CurrConnections",
+                namespace="AWS/ElastiCache",
+                statistic="Average",
+                threshold=redis_conn_threshold,
+                period=redis_period,
+                evaluation_periods=redis_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Redis connections above {redis_conn_threshold}. "
+                    "Check: connection pool settings, task/service "
+                    "count, potential connection leaks."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(
+                    depends_on=[alarm_topic, replication_group]
+                ),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-redis-host-cpu",
+                name=f"{project.name_prefix}-redis-host-cpu",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={"CacheClusterId": cache_cluster_id},
+                metric_name="CPUUtilization",
+                namespace="AWS/ElastiCache",
+                statistic="Average",
+                threshold=redis_host_cpu_threshold,
+                period=redis_period,
+                evaluation_periods=redis_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Redis host CPU above {redis_host_cpu_threshold}%. "
+                    "This monitors the underlying host, not just the Redis "
+                    "engine. On nodes with <= 2 vCPUs, EngineCPUUtilization "
+                    "alone can miss host overload. Check: background processes, "
+                    "node type, consider upgrading"
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(
+                    depends_on=[alarm_topic, replication_group]
+                ),
+            )
+
+        # -----------------------------------------------------------------
+        # CloudWatch Dashboard
+        # -----------------------------------------------------------------
+        dash_cfg = monitoring_cfg.get("dashboard", {})
+        dash_period = dash_cfg.get("period", 300)
+
+        dashboard_outputs = {}
+        for svc_name in ["web", "versioncheck"]:
+            fargate_svc = fargate_services.get(svc_name)
+            if fargate_svc:
+                svc_alb = fargate_svc.resources.get("fargate_service_alb")
+                if svc_alb:
+                    alb = svc_alb.resources["albs"].get(svc_name)
+                    if alb:
+                        dashboard_outputs[f"{svc_name}_alb_suffix"] = alb.arn_suffix
+                svc_res = fargate_svc.resources.get("service")
+                cluster_res = fargate_svc.resources.get("cluster")
+                if svc_res:
+                    dashboard_outputs[f"{svc_name}_svc_name"] = svc_res.name
+                if cluster_res:
+                    dashboard_outputs[f"{svc_name}_cluster"] = cluster_res.arn.apply(
+                        lambda arn: arn.split("/")[-1]
+                    )
+
+        worker_svc = fargate_services.get("worker")
+        if worker_svc:
+            svc_res = worker_svc.resources.get("service")
+            cluster_res = worker_svc.resources.get("cluster")
+            if svc_res:
+                dashboard_outputs["worker_svc_name"] = svc_res.name
+            if cluster_res:
+                dashboard_outputs["worker_cluster"] = cluster_res.arn.apply(
+                    lambda arn: arn.split("/")[-1]
+                )
+
+        if mq_broker is not None:
+            dashboard_outputs["mq_broker_id"] = mq_broker.id
+
+        if redis_cluster:
+            dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[
+                "replication_group"
+            ].id.apply(lambda rg_id: f"{rg_id}-001")
+
+        mq_queue = alarm_cfg.get("mq", {}).get("queue_name", "olympia")
+        mq_vhost_dash = alarm_cfg.get("mq", {}).get("virtual_host", "/")
+        region = project.aws_region
+
+        if dashboard_outputs:
+            dashboard_body = pulumi.Output.all(**dashboard_outputs).apply(
+                lambda o: json.dumps(
+                    {
+                        "widgets": [
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 0,
+                                        "y": 0,
+                                        "width": 12,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Web ALB - Requests and Errors",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "RequestCount",
+                                                    "LoadBalancer",
+                                                    o["web_alb_suffix"],
+                                                    {"stat": "Sum"},
+                                                ],
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "HTTPCode_ELB_5XX_Count",
+                                                    "LoadBalancer",
+                                                    o["web_alb_suffix"],
+                                                    {"stat": "Sum"},
+                                                ],
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "HTTPCode_Target_5XX_Count",
+                                                    "LoadBalancer",
+                                                    o["web_alb_suffix"],
+                                                    {"stat": "Sum"},
+                                                ],
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "TargetResponseTime",
+                                                    "LoadBalancer",
+                                                    o["web_alb_suffix"],
+                                                    {
+                                                        "stat": "Average",
+                                                        "yAxis": "right",
+                                                    },
+                                                ],
+                                            ],
+                                            "yAxis": {
+                                                "right": {
+                                                    "label": "Seconds",
+                                                    "showUnits": False,
+                                                }
+                                            },
+                                        },
+                                    }
+                                ]
+                                if "web_alb_suffix" in o
+                                else []
+                            ),
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 12,
+                                        "y": 0,
+                                        "width": 12,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Versioncheck ALB - Requests and Errors",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "RequestCount",
+                                                    "LoadBalancer",
+                                                    o["versioncheck_alb_suffix"],
+                                                    {"stat": "Sum"},
+                                                ],
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "HTTPCode_ELB_5XX_Count",
+                                                    "LoadBalancer",
+                                                    o["versioncheck_alb_suffix"],
+                                                    {"stat": "Sum"},
+                                                ],
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "HTTPCode_Target_5XX_Count",
+                                                    "LoadBalancer",
+                                                    o["versioncheck_alb_suffix"],
+                                                    {"stat": "Sum"},
+                                                ],
+                                                [
+                                                    "AWS/ApplicationELB",
+                                                    "TargetResponseTime",
+                                                    "LoadBalancer",
+                                                    o["versioncheck_alb_suffix"],
+                                                    {
+                                                        "stat": "Average",
+                                                        "yAxis": "right",
+                                                    },
+                                                ],
+                                            ],
+                                            "yAxis": {
+                                                "right": {
+                                                    "label": "Seconds",
+                                                    "showUnits": False,
+                                                }
+                                            },
+                                        },
+                                    }
+                                ]
+                                if "versioncheck_alb_suffix" in o
+                                else []
+                            ),
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 0,
+                                        "y": 6,
+                                        "width": 8,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Web ECS - CPU and Memory",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ECS",
+                                                    "CPUUtilization",
+                                                    "ClusterName",
+                                                    o["web_cluster"],
+                                                    "ServiceName",
+                                                    o["web_svc_name"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/ECS",
+                                                    "MemoryUtilization",
+                                                    "ClusterName",
+                                                    o["web_cluster"],
+                                                    "ServiceName",
+                                                    o["web_svc_name"],
+                                                    {"stat": "Average"},
+                                                ],
+                                            ],
+                                        },
+                                    }
+                                ]
+                                if "web_cluster" in o and "web_svc_name" in o
+                                else []
+                            ),
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 8,
+                                        "y": 6,
+                                        "width": 8,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Worker ECS - CPU and Memory",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ECS",
+                                                    "CPUUtilization",
+                                                    "ClusterName",
+                                                    o["worker_cluster"],
+                                                    "ServiceName",
+                                                    o["worker_svc_name"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/ECS",
+                                                    "MemoryUtilization",
+                                                    "ClusterName",
+                                                    o["worker_cluster"],
+                                                    "ServiceName",
+                                                    o["worker_svc_name"],
+                                                    {"stat": "Average"},
+                                                ],
+                                            ],
+                                        },
+                                    }
+                                ]
+                                if "worker_cluster" in o and "worker_svc_name" in o
+                                else []
+                            ),
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 16,
+                                        "y": 6,
+                                        "width": 8,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Versioncheck ECS - CPU and Memory",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ECS",
+                                                    "CPUUtilization",
+                                                    "ClusterName",
+                                                    o["versioncheck_cluster"],
+                                                    "ServiceName",
+                                                    o["versioncheck_svc_name"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/ECS",
+                                                    "MemoryUtilization",
+                                                    "ClusterName",
+                                                    o["versioncheck_cluster"],
+                                                    "ServiceName",
+                                                    o["versioncheck_svc_name"],
+                                                    {"stat": "Average"},
+                                                ],
+                                            ],
+                                        },
+                                    }
+                                ]
+                                if "versioncheck_cluster" in o
+                                and "versioncheck_svc_name" in o
+                                else []
+                            ),
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 0,
+                                        "y": 12,
+                                        "width": 12,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Amazon MQ - Queue Health",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/AmazonMQ",
+                                                    "MessageReadyCount",
+                                                    "Broker",
+                                                    o["mq_broker_id"],
+                                                    "VirtualHost",
+                                                    mq_vhost_dash,
+                                                    "Queue",
+                                                    mq_queue,
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/AmazonMQ",
+                                                    "MessageUnacknowledgedCount",
+                                                    "Broker",
+                                                    o["mq_broker_id"],
+                                                    "VirtualHost",
+                                                    mq_vhost_dash,
+                                                    "Queue",
+                                                    mq_queue,
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/AmazonMQ",
+                                                    "ConsumerCount",
+                                                    "Broker",
+                                                    o["mq_broker_id"],
+                                                    "VirtualHost",
+                                                    mq_vhost_dash,
+                                                    "Queue",
+                                                    mq_queue,
+                                                    {
+                                                        "stat": "Minimum",
+                                                        "yAxis": "right",
+                                                    },
+                                                ],
+                                            ],
+                                            "yAxis": {
+                                                "right": {
+                                                    "label": "Consumers",
+                                                    "showUnits": False,
+                                                }
+                                            },
+                                        },
+                                    },
+                                    {
+                                        "type": "metric",
+                                        "x": 12,
+                                        "y": 12,
+                                        "width": 12,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Amazon MQ - Broker Resources",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/AmazonMQ",
+                                                    "SystemCpuUtilization",
+                                                    "Broker",
+                                                    o["mq_broker_id"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/AmazonMQ",
+                                                    "RabbitMQMemUsed",
+                                                    "Broker",
+                                                    o["mq_broker_id"],
+                                                    {
+                                                        "stat": "Average",
+                                                        "yAxis": "right",
+                                                    },
+                                                ],
+                                            ],
+                                            "yAxis": {
+                                                "right": {
+                                                    "label": "Bytes",
+                                                    "showUnits": False,
+                                                }
+                                            },
+                                        },
+                                    },
+                                ]
+                                if "mq_broker_id" in o
+                                else []
+                            ),
+                            *(
+                                [
+                                    {
+                                        "type": "metric",
+                                        "x": 0,
+                                        "y": 18,
+                                        "width": 12,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Redis - Memory and Evictions",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ElastiCache",
+                                                    "DatabaseMemoryUsagePercentage",
+                                                    "CacheClusterId",
+                                                    o["redis_cluster_id"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/ElastiCache",
+                                                    "Evictions",
+                                                    "CacheClusterId",
+                                                    o["redis_cluster_id"],
+                                                    {"stat": "Sum", "yAxis": "right"},
+                                                ],
+                                            ],
+                                            "yAxis": {
+                                                "right": {
+                                                    "label": "Count",
+                                                    "showUnits": False,
+                                                }
+                                            },
+                                        },
+                                    },
+                                    {
+                                        "type": "metric",
+                                        "x": 12,
+                                        "y": 18,
+                                        "width": 12,
+                                        "height": 6,
+                                        "properties": {
+                                            "title": "Redis - CPU and Connections",
+                                            "region": region,
+                                            "period": dash_period,
+                                            "metrics": [
+                                                [
+                                                    "AWS/ElastiCache",
+                                                    "EngineCPUUtilization",
+                                                    "CacheClusterId",
+                                                    o["redis_cluster_id"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/ElastiCache",
+                                                    "CPUUtilization",
+                                                    "CacheClusterId",
+                                                    o["redis_cluster_id"],
+                                                    {"stat": "Average"},
+                                                ],
+                                                [
+                                                    "AWS/ElastiCache",
+                                                    "CurrConnections",
+                                                    "CacheClusterId",
+                                                    o["redis_cluster_id"],
+                                                    {
+                                                        "stat": "Average",
+                                                        "yAxis": "right",
+                                                    },
+                                                ],
+                                            ],
+                                            "yAxis": {
+                                                "right": {
+                                                    "label": "Connections",
+                                                    "showUnits": False,
+                                                }
+                                            },
+                                        },
+                                    },
+                                ]
+                                if "redis_cluster_id" in o
+                                else []
+                            ),
+                        ],
+                    }
+                )
+            )
+
+            aws.cloudwatch.Dashboard(
+                f"{project.name_prefix}-dashboard",
+                dashboard_name=f"{project.name_prefix}-health",
+                dashboard_body=dashboard_body,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic]),
+            )
+
+        # -----------------------------------------------------------------
+        # Monitoring exports
+        # -----------------------------------------------------------------
+        pulumi.export("monitoring_sns_topic_arn", alarm_topic.arn)
+        pulumi.export(
+            "monitoring_dashboard_name",
+            f"{project.name_prefix}-health",
+        )
+
     # =========================================================================
     # ECS Scheduled Tasks (Cron Jobs)
     # =========================================================================
diff --git a/infra/pulumi/config.stage.yaml b/infra/pulumi/config.stage.yaml
index fa40c65c9171..75c9a6d8ef3a 100644
--- a/infra/pulumi/config.stage.yaml
+++ b/infra/pulumi/config.stage.yaml
@@ -638,6 +638,52 @@ resources:
 #   - efs_filesystem_id -- replaced by Pulumi-managed aws:efs:FileSystem
 #   - mq_admin_password -- replaced by mq_credentials (JSON)
 
+  # =============================================================================
+  # Monitoring and Alarms (prod-gating baseline)
+  # =============================================================================
+  # Phase 1: ALB, target group, ECS service, Amazon MQ, Redis alarms
+  # plus a single CloudWatch dashboard for "is stage healthy?" triage
+  # Phase 2 (future): EFS, log metric filters, deployment instability,
+  # external/shared resources (RDS, Memcached, OpenSearch)
+  monitoring:
+    notify_emails_secret_name: atn/stage/monitoring_notify_emails
+    alarms:
+      alb:
+        error_threshold: 10
+        error_period: 60
+        response_time_threshold: 1
+        response_time_period: 60
+        evaluation_periods: 2
+      target_group:
+        unhealthy_threshold: 1
+        period: 60
+        evaluation_periods: 2
+      ecs:
+        cpu_threshold: 80
+        memory_threshold: 80
+        period: 300
+        evaluation_periods: 2
+      mq:
+        message_ready_threshold: 1000
+        consumer_alarm_enabled: false  # worker desired_count is intentionally 0 in current stage posture
+        consumer_count_threshold: 1
+        cpu_threshold: 80
+        memory_bytes_threshold: 512000000  # ~512 MB (mq.t3.micro has 1 GB)
+        period: 300
+        evaluation_periods: 2
+        queue_name: olympia
+        virtual_host: /
+      redis:
+        memory_pct_threshold: 80
+        eviction_threshold: 100
+        cpu_threshold: 80
+        host_cpu_threshold: 90  # cache.t3.small has 2 vCPUs; AWS recommends monitoring host CPUUtilization alongside EngineCPUUtilization on nodes with <= 2 vCPUs
+        connection_threshold: 500
+        period: 300
+        evaluation_periods: 2
+    dashboard:
+      period: 300
+
 # =============================================================================
 # Notes for implementation:
 # =============================================================================

From 56b64b1c04a2fc4b32b420f4c0c7d1f473837ca9 Mon Sep 17 00:00:00 2001
From: Jonathan Alvarez Delgado <jonathan.adl@proton.me>
Date: Wed, 29 Apr 2026 01:33:29 +0200
Subject: [PATCH 2/4] fix(pulumi): correct MQ broker dimension and add
 availability alarms

---
 infra/pulumi/__main__.py | 154 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 137 insertions(+), 17 deletions(-)

diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py
index aaa8315145a8..bcfb3369d3fa 100755
--- a/infra/pulumi/__main__.py
+++ b/infra/pulumi/__main__.py
@@ -934,6 +934,33 @@ def main():
                 opts=pulumi.ResourceOptions(depends_on=[alarm_topic]),
             )
 
+        # If notifications fail to deliver, every other alarm in this stack is
+        # also silently undelivered. We publish this alarm to the same topic
+        # for CloudWatch-console visibility; for Phase 2 a secondary channel
+        # (SMS, Slack, separate topic) should provide an independent path
+        aws.cloudwatch.MetricAlarm(
+            f"{project.name_prefix}-alarm-topic-delivery-failures",
+            name=f"{project.name_prefix}-alarm-topic-delivery-failures",
+            alarm_actions=[alarm_topic.arn],
+            ok_actions=[alarm_topic.arn],
+            comparison_operator="GreaterThanOrEqualToThreshold",
+            dimensions={"TopicName": alarm_topic.name},
+            metric_name="NumberOfNotificationsFailed",
+            namespace="AWS/SNS",
+            statistic="Sum",
+            threshold=1,
+            period=300,
+            evaluation_periods=1,
+            treat_missing_data="notBreaching",
+            alarm_description=(
+                "One or more alarm notifications failed delivery from the "
+                "stage alarm topic. Check: SNS subscription confirmations, "
+                "recipient email validity, topic policy"
+            ),
+            tags=project.common_tags,
+            opts=pulumi.ResourceOptions(depends_on=[alarm_topic]),
+        )
+
         # -----------------------------------------------------------------
         # ALB alarms (web, versioncheck)
         # -----------------------------------------------------------------
@@ -952,10 +979,12 @@ def main():
             if not svc_alb:
                 continue
             alb = svc_alb.resources["albs"].get(svc_name)
-            if not alb:
+            tg = svc_alb.resources["target_groups"].get(svc_name)
+            if not alb or not tg:
                 continue
 
             lb_suffix = alb.arn_suffix
+            tg_suffix = tg.arn_suffix
 
             aws.cloudwatch.MetricAlarm(
                 f"{project.name_prefix}-{svc_name}-alb-5xx",
@@ -986,7 +1015,10 @@ def main():
                 alarm_actions=[alarm_topic.arn],
                 ok_actions=[alarm_topic.arn],
                 comparison_operator="GreaterThanOrEqualToThreshold",
-                dimensions={"LoadBalancer": lb_suffix},
+                dimensions={
+                    "LoadBalancer": lb_suffix,
+                    "TargetGroup": tg_suffix,
+                },
                 metric_name="HTTPCode_Target_5XX_Count",
                 namespace="AWS/ApplicationELB",
                 statistic="Sum",
@@ -1009,7 +1041,10 @@ def main():
                 alarm_actions=[alarm_topic.arn],
                 ok_actions=[alarm_topic.arn],
                 comparison_operator="GreaterThanOrEqualToThreshold",
-                dimensions={"LoadBalancer": lb_suffix},
+                dimensions={
+                    "LoadBalancer": lb_suffix,
+                    "TargetGroup": tg_suffix,
+                },
                 metric_name="TargetResponseTime",
                 namespace="AWS/ApplicationELB",
                 statistic="Average",
@@ -1062,6 +1097,9 @@ def main():
                 threshold=tg_unhealthy_threshold,
                 period=tg_period,
                 evaluation_periods=tg_eval_periods,
+                # Positive availability is covered by the healthy-hosts alarm
+                # below; this alarm flags elevated unhealthy hosts even while
+                # at least one healthy host remains
                 treat_missing_data="notBreaching",
                 alarm_description=(
                     f"Unhealthy hosts detected in {svc_name} target group. "
@@ -1072,6 +1110,36 @@ def main():
                 opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]),
             )
 
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-healthy-hosts",
+                name=f"{project.name_prefix}-{svc_name}-healthy-hosts",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="LessThanThreshold",
+                dimensions={
+                    "TargetGroup": tg.arn_suffix,
+                    "LoadBalancer": alb.arn_suffix,
+                },
+                metric_name="HealthyHostCount",
+                namespace="AWS/ApplicationELB",
+                statistic="Minimum",
+                threshold=tg_cfg.get("healthy_threshold", 1),
+                period=tg_period,
+                evaluation_periods=tg_eval_periods,
+                # Missing data on this metric means the target group has no
+                # registered targets -- operationally indistinguishable from
+                # zero healthy hosts and therefore treated as breaching
+                treat_missing_data="breaching",
+                alarm_description=(
+                    f"No healthy hosts in {svc_name} target group. "
+                    "Check: is the ECS service running, is the container "
+                    "health-check responding (/services/monitor.json), is "
+                    "the SG allowing traffic from the ALB?"
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]),
+            )
+
         # -----------------------------------------------------------------
         # ECS service alarms (web, worker, versioncheck)
         # -----------------------------------------------------------------
@@ -1080,6 +1148,7 @@ def main():
         ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80)
         ecs_period = ecs_cfg.get("period", 300)
         ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2)
+        ecs_min_tasks = ecs_cfg.get("min_tasks", 1)
 
         for svc_name, fargate_svc in fargate_services.items():
             ecs_service = fargate_svc.resources.get("service")
@@ -1132,6 +1201,8 @@ def main():
                 threshold=ecs_mem_threshold,
                 period=ecs_period,
                 evaluation_periods=ecs_eval_periods,
+                # Positive availability is covered by the running-tasks alarm
+                # below; CPU/memory only matter while tasks exist
                 treat_missing_data="notBreaching",
                 alarm_description=(
                     f"Memory utilisation above {ecs_mem_threshold}% on {svc_name} service. "
@@ -1142,6 +1213,38 @@ def main():
                 opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
             )
 
+            # Container Insights publishes RunningTaskCount per service in the
+            # ECS/ContainerInsights namespace. `min_tasks` is read once for all
+            # services, so operators draining one intentionally should lower it
+            # globally in config or temporarily disable this alarm
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-running-tasks",
+                name=f"{project.name_prefix}-{svc_name}-running-tasks",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="LessThanThreshold",
+                dimensions={
+                    "ClusterName": cluster_name,
+                    "ServiceName": service_name,
+                },
+                metric_name="RunningTaskCount",
+                namespace="ECS/ContainerInsights",
+                statistic="Minimum",
+                threshold=ecs_min_tasks,
+                period=ecs_period,
+                evaluation_periods=ecs_eval_periods,
+                # Container Insights stops emitting when a service is fully
+                # drained; that is exactly the failure we want to catch
+                treat_missing_data="breaching",
+                alarm_description=(
+                    f"Running task count below {ecs_min_tasks} on {svc_name}. "
+                    "Check: deployment status, service events for stop "
+                    "reasons, scheduled actions, task health"
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
+            )
+
         # -----------------------------------------------------------------
         # Amazon MQ alarms
         # -----------------------------------------------------------------
@@ -1158,7 +1261,12 @@ def main():
             mq_period = mq_cfg.get("period", 300)
             mq_eval_periods = mq_cfg.get("evaluation_periods", 2)
 
-            broker_id = mq_broker.id
+            # AWS publishes Amazon MQ for RabbitMQ metrics with the `Broker`
+            # dimension set to the broker name, not the broker ID. The Pulumi
+            # `aws.mq.Broker.id` output is the AWS broker UUID (e.g.
+            # b-xxxxxxxx-...) which would point at a non-existent metric
+            # series
+            broker_name = mq_broker.broker_name
 
             aws.cloudwatch.MetricAlarm(
                 f"{project.name_prefix}-mq-message-ready",
@@ -1167,7 +1275,7 @@ def main():
                 ok_actions=[alarm_topic.arn],
                 comparison_operator="GreaterThanOrEqualToThreshold",
                 dimensions={
-                    "Broker": broker_id,
+                    "Broker": broker_name,
                     "VirtualHost": mq_vhost,
                     "Queue": mq_queue_name,
                 },
@@ -1195,7 +1303,7 @@ def main():
                     ok_actions=[alarm_topic.arn],
                     comparison_operator="LessThanThreshold",
                     dimensions={
-                        "Broker": broker_id,
+                        "Broker": broker_name,
                         "VirtualHost": mq_vhost,
                         "Queue": mq_queue_name,
                     },
@@ -1221,14 +1329,16 @@ def main():
                 alarm_actions=[alarm_topic.arn],
                 ok_actions=[alarm_topic.arn],
                 comparison_operator="GreaterThanOrEqualToThreshold",
-                dimensions={"Broker": broker_id},
+                dimensions={"Broker": broker_name},
                 metric_name="SystemCpuUtilization",
                 namespace="AWS/AmazonMQ",
                 statistic="Average",
                 threshold=mq_cpu_threshold,
                 period=mq_period,
                 evaluation_periods=mq_eval_periods,
-                treat_missing_data="notBreaching",
+                # Managed broker emits resource metrics whenever it is RUNNING;
+                # absence of data indicates the broker itself is in trouble.
+                treat_missing_data="breaching",
                 alarm_description=(
                     f"Broker CPU above {mq_cpu_threshold}%. Check: "
                     "queue depth, message throughput, consider "
@@ -1244,14 +1354,16 @@ def main():
                 alarm_actions=[alarm_topic.arn],
                 ok_actions=[alarm_topic.arn],
                 comparison_operator="GreaterThanOrEqualToThreshold",
-                dimensions={"Broker": broker_id},
+                dimensions={"Broker": broker_name},
                 metric_name="RabbitMQMemUsed",
                 namespace="AWS/AmazonMQ",
                 statistic="Average",
                 threshold=mq_mem_threshold,
                 period=mq_period,
                 evaluation_periods=mq_eval_periods,
-                treat_missing_data="notBreaching",
+                # Same rationale as mq-cpu: missing data on a managed broker
+                # is itself a failure signal.
+                treat_missing_data="breaching",
                 alarm_description=(
                     f"Broker memory above {mq_mem_threshold} bytes. "
                     "Check: queue depth and message sizes, consider "
@@ -1277,6 +1389,12 @@ def main():
             redis_eval_periods = redis_cfg.get("evaluation_periods", 2)
 
             replication_group = redis_cluster.resources["replication_group"]
+            # ElastiCache publishes per-node metrics under the cache cluster ID,
+            # which for a single-node replication group is `<rg-id>-001`. Verify
+            # at first deploy by reading one CloudWatch datapoint for the
+            # alarms below; if the dimension value does not match an emitted
+            # series, switch to `replication_group.member_clusters[0]` (a list
+            # output that holds the actual cache cluster IDs)
             cache_cluster_id = replication_group.id.apply(lambda rg_id: f"{rg_id}-001")
 
             aws.cloudwatch.MetricAlarm(
@@ -1442,7 +1560,9 @@ def main():
                 )
 
         if mq_broker is not None:
-            dashboard_outputs["mq_broker_id"] = mq_broker.id
+            # Dashboard widgets feed this value into the `Broker` CloudWatch
+            # dimension, which AWS keys by broker name (not the b-... UUID)
+            dashboard_outputs["mq_broker_name"] = mq_broker.broker_name
 
         if redis_cluster:
             dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[
@@ -1704,7 +1824,7 @@ def main():
                                                     "AWS/AmazonMQ",
                                                     "MessageReadyCount",
                                                     "Broker",
-                                                    o["mq_broker_id"],
+                                                    o["mq_broker_name"],
                                                     "VirtualHost",
                                                     mq_vhost_dash,
                                                     "Queue",
@@ -1715,7 +1835,7 @@ def main():
                                                     "AWS/AmazonMQ",
                                                     "MessageUnacknowledgedCount",
                                                     "Broker",
-                                                    o["mq_broker_id"],
+                                                    o["mq_broker_name"],
                                                     "VirtualHost",
                                                     mq_vhost_dash,
                                                     "Queue",
@@ -1726,7 +1846,7 @@ def main():
                                                     "AWS/AmazonMQ",
                                                     "ConsumerCount",
                                                     "Broker",
-                                                    o["mq_broker_id"],
+                                                    o["mq_broker_name"],
                                                     "VirtualHost",
                                                     mq_vhost_dash,
                                                     "Queue",
@@ -1760,14 +1880,14 @@ def main():
                                                     "AWS/AmazonMQ",
                                                     "SystemCpuUtilization",
                                                     "Broker",
-                                                    o["mq_broker_id"],
+                                                    o["mq_broker_name"],
                                                     {"stat": "Average"},
                                                 ],
                                                 [
                                                     "AWS/AmazonMQ",
                                                     "RabbitMQMemUsed",
                                                     "Broker",
-                                                    o["mq_broker_id"],
+                                                    o["mq_broker_name"],
                                                     {
                                                         "stat": "Average",
                                                         "yAxis": "right",
@@ -1783,7 +1903,7 @@ def main():
                                         },
                                     },
                                 ]
-                                if "mq_broker_id" in o
+                                if "mq_broker_name" in o
                                 else []
                             ),
                             *(

From 7882889378eed35087e74ce4e8df5932e9533ade Mon Sep 17 00:00:00 2001
From: Jonathan Alvarez Delgado <jonathan.adl@proton.me>
Date: Wed, 29 Apr 2026 04:07:27 +0200
Subject: [PATCH 3/4] refactor(pulumi): extract dashboard widget builders to
 module level

---
 infra/pulumi/__main__.py | 676 +++++++++++++++------------------------
 1 file changed, 259 insertions(+), 417 deletions(-)

diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py
index bcfb3369d3fa..03bc04653a1f 100755
--- a/infra/pulumi/__main__.py
+++ b/infra/pulumi/__main__.py
@@ -36,6 +36,263 @@
 import tb_pulumi.network
 
 
+# ---------------------------------------------------------------------------
+# CloudWatch dashboard widget builders
+#
+# Each widget builder takes the resolved-output dict `o` (string values from
+# pulumi.Output.all().apply) plus pure-Python layout config and returns the
+# CloudWatch widget shape as a dict (or list of dicts); _build_dashboard_body
+# dumps them to JSON. None has Pulumi deps, so all can be unit tested alone
+# ---------------------------------------------------------------------------
+
+
+def _alb_requests_widget(o, svc_name, region, period, x, y):
+    """Build the ALB requests/errors widget for one service.
+
+    Plots the RequestCount, HTTPCode_ELB_5XX_Count and
+    HTTPCode_Target_5XX_Count sums, with the average TargetResponseTime
+    on the right axis, all keyed by the `LoadBalancer` dimension.
+
+    Args:
+        o: resolved-output dict; must contain `<svc_name>_alb_suffix`
+        svc_name: service key, also capitalised into the widget title
+        region: region written into the widget properties
+        period: period written into the widget properties
+        x: value stored under the widget's `x` key
+        y: value stored under the widget's `y` key
+
+    Returns:
+        The widget definition as a plain dict (width 12, height 6)
+
+    Raises:
+        KeyError: if `<svc_name>_alb_suffix` is missing from `o`
+    """
+    namespace = "AWS/ApplicationELB"
+    lb_dims = ("LoadBalancer", o[f"{svc_name}_alb_suffix"])
+    # (metric name, render options) in the order the series are listed
+    series = (
+        ("RequestCount", {"stat": "Sum"}),
+        ("HTTPCode_ELB_5XX_Count", {"stat": "Sum"}),
+        ("HTTPCode_Target_5XX_Count", {"stat": "Sum"}),
+        ("TargetResponseTime", {"stat": "Average", "yAxis": "right"}),
+    )
+    properties = {
+        "title": f"{svc_name.capitalize()} ALB - Requests and Errors",
+        "region": region,
+        "period": period,
+        "metrics": [[namespace, metric, *lb_dims, opts] for metric, opts in series],
+        "yAxis": {"right": {"label": "Seconds", "showUnits": False}},
+    }
+    return {
+        "type": "metric",
+        "x": x,
+        "y": y,
+        "width": 12,
+        "height": 6,
+        "properties": properties,
+    }
+
+
+def _ecs_resources_widget(o, svc_name, region, period, x, y, width=8):
+    """Build the ECS CPU/memory utilisation widget for one service.
+
+    Both series use the Average statistic and the `ClusterName` /
+    `ServiceName` dimensions read from `o["<svc_name>_cluster"]` and
+    `o["<svc_name>_svc_name"]`.
+
+    Args:
+        o: resolved-output dict holding the two keys above
+        svc_name: service key, also capitalised into the widget title
+        region: region written into the widget properties
+        period: period written into the widget properties
+        x: value stored under the widget's `x` key
+        y: value stored under the widget's `y` key
+        width: value stored under the widget's `width` key
+
+    Returns:
+        The widget definition as a plain dict (height 6)
+    """
+    dims = [
+        "ClusterName",
+        o[f"{svc_name}_cluster"],
+        "ServiceName",
+        o[f"{svc_name}_svc_name"],
+    ]
+    metric_names = ("CPUUtilization", "MemoryUtilization")
+    metrics = [["AWS/ECS", m, *dims, {"stat": "Average"}] for m in metric_names]
+    layout = {"type": "metric", "x": x, "y": y, "width": width, "height": 6}
+    properties = {
+        "title": f"{svc_name.capitalize()} ECS - CPU and Memory",
+        "region": region,
+        "period": period,
+        "metrics": metrics,
+    }
+    return {**layout, "properties": properties}
+
+
+def _mq_widgets(o, region, period, queue, vhost):
+    """Build the two Amazon MQ widgets, both placed at y=12.
+
+    The first widget plots ready and unacknowledged message counts plus the
+    minimum consumer count for one queue; the second plots
+    SystemCpuUtilization and RabbitMQMemUsed for the broker.
+
+    Args:
+        o: resolved-output dict; must contain `mq_broker_name`
+        region: region written into both widgets
+        period: period written into both widgets
+        queue: value for the `Queue` dimension
+        vhost: value for the `VirtualHost` dimension
+
+    Returns:
+        A two-element list of widget dicts, queue widget first
+
+    Raises:
+        KeyError: if `mq_broker_name` is missing from `o`
+    """
+    namespace = "AWS/AmazonMQ"
+    broker_dims = ["Broker", o["mq_broker_name"]]
+    queue_dims = [*broker_dims, "VirtualHost", vhost, "Queue", queue]
+
+    def _panel(x, title, metrics, right_label):
+        # Both panels share the row, size and property layout; only the
+        # x position, title, series and right-axis label differ
+        return {
+            "type": "metric",
+            "x": x,
+            "y": 12,
+            "width": 12,
+            "height": 6,
+            "properties": {
+                "title": title,
+                "region": region,
+                "period": period,
+                "metrics": metrics,
+                "yAxis": {"right": {"label": right_label, "showUnits": False}},
+            },
+        }
+
+    queue_series = (
+        ("MessageReadyCount", {"stat": "Average"}),
+        ("MessageUnacknowledgedCount", {"stat": "Average"}),
+        ("ConsumerCount", {"stat": "Minimum", "yAxis": "right"}),
+    )
+    broker_series = (
+        ("SystemCpuUtilization", {"stat": "Average"}),
+        ("RabbitMQMemUsed", {"stat": "Average", "yAxis": "right"}),
+    )
+    queue_metrics = [[namespace, m, *queue_dims, opts] for m, opts in queue_series]
+    broker_metrics = [[namespace, m, *broker_dims, opts] for m, opts in broker_series]
+    return [
+        _panel(
+            x=0,
+            title="Amazon MQ - Queue Health",
+            metrics=queue_metrics,
+            right_label="Consumers",
+        ),
+        _panel(
+            x=12,
+            title="Amazon MQ - Broker Resources",
+            metrics=broker_metrics,
+            right_label="Bytes",
+        ),
+    ]
+
+
+def _redis_widgets(o, region, period):
+    """Build the two Redis (ElastiCache) widgets, both placed at y=18.
+
+    The first widget plots DatabaseMemoryUsagePercentage and Evictions;
+    the second plots EngineCPUUtilization, CPUUtilization and
+    CurrConnections. Every series is keyed by the `CacheClusterId`
+    dimension, and both widgets use width 12 and height 6.
+
+    Args:
+        o: resolved-output dict; must contain `redis_cluster_id`
+        region: region written into both widgets
+        period: period written into both widgets
+
+    Returns:
+        A two-element list of widget dicts: the memory/evictions widget
+        followed by the CPU/connections widget
+
+    Raises:
+        KeyError: if `redis_cluster_id` is missing from `o`
+    """
+    namespace = "AWS/ElastiCache"
+    node_dims = ("CacheClusterId", o["redis_cluster_id"])
+
+    # One entry per widget: x position, title, right-axis label, then the
+    # (metric name, render options) pairs in the order they are listed
+    panels = (
+        (
+            0,
+            "Redis - Memory and Evictions",
+            "Count",
+            (
+                ("DatabaseMemoryUsagePercentage", {"stat": "Average"}),
+                ("Evictions", {"stat": "Sum", "yAxis": "right"}),
+            ),
+        ),
+        (
+            12,
+            "Redis - CPU and Connections",
+            "Connections",
+            (
+                ("EngineCPUUtilization", {"stat": "Average"}),
+                ("CPUUtilization", {"stat": "Average"}),
+                ("CurrConnections", {"stat": "Average", "yAxis": "right"}),
+            ),
+        ),
+    )
+
+    widgets = []
+    for x, title, right_label, series in panels:
+        # Row, size and property layout are shared; only the per-panel
+        # values unpacked above differ
+        metrics = [[namespace, name, *node_dims, opts] for name, opts in series]
+        widgets.append(
+            {
+                "type": "metric",
+                "x": x,
+                "y": 18,
+                "width": 12,
+                "height": 6,
+                "properties": {
+                    "title": title,
+                    "region": region,
+                    "period": period,
+                    "metrics": metrics,
+                    "yAxis": {"right": {"label": right_label, "showUnits": False}},
+                },
+            }
+        )
+    return widgets
+
+
+def _build_dashboard_body(o, region, period, mq_queue, mq_vhost):
+    """Return the dashboard JSON body, omitting widgets whose keys `o` lacks."""
+    # (service, x position) pairs; ALB widgets use y=0, ECS widgets use y=6
+    alb_slots = (("web", 0), ("versioncheck", 12))
+    ecs_slots = (("web", 0), ("worker", 8), ("versioncheck", 16))
+
+    widgets = [
+        _alb_requests_widget(o, svc, region, period, x=col, y=0)
+        for svc, col in alb_slots
+        if f"{svc}_alb_suffix" in o
+    ]
+    widgets += [
+        _ecs_resources_widget(o, svc, region, period, x=col, y=6)
+        for svc, col in ecs_slots
+        if f"{svc}_cluster" in o and f"{svc}_svc_name" in o
+    ]
+    if "mq_broker_name" in o:
+        widgets += _mq_widgets(o, region, period, mq_queue, mq_vhost)
+    if "redis_cluster_id" in o:
+        widgets += _redis_widgets(o, region, period)
+    return json.dumps({"widgets": widgets})
+
+
 def main():
     # Create a ThunderbirdPulumiProject to aggregate resources
     # This loads config.{stack}.yaml automatically
@@ -1575,423 +1832,8 @@ def main():
 
         if dashboard_outputs:
             dashboard_body = pulumi.Output.all(**dashboard_outputs).apply(
-                lambda o: json.dumps(
-                    {
-                        "widgets": [
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 0,
-                                        "y": 0,
-                                        "width": 12,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Web ALB - Requests and Errors",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "RequestCount",
-                                                    "LoadBalancer",
-                                                    o["web_alb_suffix"],
-                                                    {"stat": "Sum"},
-                                                ],
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "HTTPCode_ELB_5XX_Count",
-                                                    "LoadBalancer",
-                                                    o["web_alb_suffix"],
-                                                    {"stat": "Sum"},
-                                                ],
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "HTTPCode_Target_5XX_Count",
-                                                    "LoadBalancer",
-                                                    o["web_alb_suffix"],
-                                                    {"stat": "Sum"},
-                                                ],
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "TargetResponseTime",
-                                                    "LoadBalancer",
-                                                    o["web_alb_suffix"],
-                                                    {
-                                                        "stat": "Average",
-                                                        "yAxis": "right",
-                                                    },
-                                                ],
-                                            ],
-                                            "yAxis": {
-                                                "right": {
-                                                    "label": "Seconds",
-                                                    "showUnits": False,
-                                                }
-                                            },
-                                        },
-                                    }
-                                ]
-                                if "web_alb_suffix" in o
-                                else []
-                            ),
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 12,
-                                        "y": 0,
-                                        "width": 12,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Versioncheck ALB - Requests and Errors",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "RequestCount",
-                                                    "LoadBalancer",
-                                                    o["versioncheck_alb_suffix"],
-                                                    {"stat": "Sum"},
-                                                ],
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "HTTPCode_ELB_5XX_Count",
-                                                    "LoadBalancer",
-                                                    o["versioncheck_alb_suffix"],
-                                                    {"stat": "Sum"},
-                                                ],
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "HTTPCode_Target_5XX_Count",
-                                                    "LoadBalancer",
-                                                    o["versioncheck_alb_suffix"],
-                                                    {"stat": "Sum"},
-                                                ],
-                                                [
-                                                    "AWS/ApplicationELB",
-                                                    "TargetResponseTime",
-                                                    "LoadBalancer",
-                                                    o["versioncheck_alb_suffix"],
-                                                    {
-                                                        "stat": "Average",
-                                                        "yAxis": "right",
-                                                    },
-                                                ],
-                                            ],
-                                            "yAxis": {
-                                                "right": {
-                                                    "label": "Seconds",
-                                                    "showUnits": False,
-                                                }
-                                            },
-                                        },
-                                    }
-                                ]
-                                if "versioncheck_alb_suffix" in o
-                                else []
-                            ),
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 0,
-                                        "y": 6,
-                                        "width": 8,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Web ECS - CPU and Memory",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ECS",
-                                                    "CPUUtilization",
-                                                    "ClusterName",
-                                                    o["web_cluster"],
-                                                    "ServiceName",
-                                                    o["web_svc_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/ECS",
-                                                    "MemoryUtilization",
-                                                    "ClusterName",
-                                                    o["web_cluster"],
-                                                    "ServiceName",
-                                                    o["web_svc_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                            ],
-                                        },
-                                    }
-                                ]
-                                if "web_cluster" in o and "web_svc_name" in o
-                                else []
-                            ),
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 8,
-                                        "y": 6,
-                                        "width": 8,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Worker ECS - CPU and Memory",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ECS",
-                                                    "CPUUtilization",
-                                                    "ClusterName",
-                                                    o["worker_cluster"],
-                                                    "ServiceName",
-                                                    o["worker_svc_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/ECS",
-                                                    "MemoryUtilization",
-                                                    "ClusterName",
-                                                    o["worker_cluster"],
-                                                    "ServiceName",
-                                                    o["worker_svc_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                            ],
-                                        },
-                                    }
-                                ]
-                                if "worker_cluster" in o and "worker_svc_name" in o
-                                else []
-                            ),
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 16,
-                                        "y": 6,
-                                        "width": 8,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Versioncheck ECS - CPU and Memory",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ECS",
-                                                    "CPUUtilization",
-                                                    "ClusterName",
-                                                    o["versioncheck_cluster"],
-                                                    "ServiceName",
-                                                    o["versioncheck_svc_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/ECS",
-                                                    "MemoryUtilization",
-                                                    "ClusterName",
-                                                    o["versioncheck_cluster"],
-                                                    "ServiceName",
-                                                    o["versioncheck_svc_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                            ],
-                                        },
-                                    }
-                                ]
-                                if "versioncheck_cluster" in o
-                                and "versioncheck_svc_name" in o
-                                else []
-                            ),
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 0,
-                                        "y": 12,
-                                        "width": 12,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Amazon MQ - Queue Health",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/AmazonMQ",
-                                                    "MessageReadyCount",
-                                                    "Broker",
-                                                    o["mq_broker_name"],
-                                                    "VirtualHost",
-                                                    mq_vhost_dash,
-                                                    "Queue",
-                                                    mq_queue,
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/AmazonMQ",
-                                                    "MessageUnacknowledgedCount",
-                                                    "Broker",
-                                                    o["mq_broker_name"],
-                                                    "VirtualHost",
-                                                    mq_vhost_dash,
-                                                    "Queue",
-                                                    mq_queue,
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/AmazonMQ",
-                                                    "ConsumerCount",
-                                                    "Broker",
-                                                    o["mq_broker_name"],
-                                                    "VirtualHost",
-                                                    mq_vhost_dash,
-                                                    "Queue",
-                                                    mq_queue,
-                                                    {
-                                                        "stat": "Minimum",
-                                                        "yAxis": "right",
-                                                    },
-                                                ],
-                                            ],
-                                            "yAxis": {
-                                                "right": {
-                                                    "label": "Consumers",
-                                                    "showUnits": False,
-                                                }
-                                            },
-                                        },
-                                    },
-                                    {
-                                        "type": "metric",
-                                        "x": 12,
-                                        "y": 12,
-                                        "width": 12,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Amazon MQ - Broker Resources",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/AmazonMQ",
-                                                    "SystemCpuUtilization",
-                                                    "Broker",
-                                                    o["mq_broker_name"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/AmazonMQ",
-                                                    "RabbitMQMemUsed",
-                                                    "Broker",
-                                                    o["mq_broker_name"],
-                                                    {
-                                                        "stat": "Average",
-                                                        "yAxis": "right",
-                                                    },
-                                                ],
-                                            ],
-                                            "yAxis": {
-                                                "right": {
-                                                    "label": "Bytes",
-                                                    "showUnits": False,
-                                                }
-                                            },
-                                        },
-                                    },
-                                ]
-                                if "mq_broker_name" in o
-                                else []
-                            ),
-                            *(
-                                [
-                                    {
-                                        "type": "metric",
-                                        "x": 0,
-                                        "y": 18,
-                                        "width": 12,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Redis - Memory and Evictions",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ElastiCache",
-                                                    "DatabaseMemoryUsagePercentage",
-                                                    "CacheClusterId",
-                                                    o["redis_cluster_id"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/ElastiCache",
-                                                    "Evictions",
-                                                    "CacheClusterId",
-                                                    o["redis_cluster_id"],
-                                                    {"stat": "Sum", "yAxis": "right"},
-                                                ],
-                                            ],
-                                            "yAxis": {
-                                                "right": {
-                                                    "label": "Count",
-                                                    "showUnits": False,
-                                                }
-                                            },
-                                        },
-                                    },
-                                    {
-                                        "type": "metric",
-                                        "x": 12,
-                                        "y": 18,
-                                        "width": 12,
-                                        "height": 6,
-                                        "properties": {
-                                            "title": "Redis - CPU and Connections",
-                                            "region": region,
-                                            "period": dash_period,
-                                            "metrics": [
-                                                [
-                                                    "AWS/ElastiCache",
-                                                    "EngineCPUUtilization",
-                                                    "CacheClusterId",
-                                                    o["redis_cluster_id"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/ElastiCache",
-                                                    "CPUUtilization",
-                                                    "CacheClusterId",
-                                                    o["redis_cluster_id"],
-                                                    {"stat": "Average"},
-                                                ],
-                                                [
-                                                    "AWS/ElastiCache",
-                                                    "CurrConnections",
-                                                    "CacheClusterId",
-                                                    o["redis_cluster_id"],
-                                                    {
-                                                        "stat": "Average",
-                                                        "yAxis": "right",
-                                                    },
-                                                ],
-                                            ],
-                                            "yAxis": {
-                                                "right": {
-                                                    "label": "Connections",
-                                                    "showUnits": False,
-                                                }
-                                            },
-                                        },
-                                    },
-                                ]
-                                if "redis_cluster_id" in o
-                                else []
-                            ),
-                        ],
-                    }
+                lambda o: _build_dashboard_body(
+                    o, region, dash_period, mq_queue, mq_vhost_dash
                 )
             )
 

From 1c797f462ba2223af2b1d0134792c8ad95100df5 Mon Sep 17 00:00:00 2001
From: Jonathan Alvarez Delgado <jonathan.adl@proton.me>
Date: Wed, 29 Apr 2026 05:12:05 +0200
Subject: [PATCH 4/4] fix(pulumi): use member_clusters and per-service
 min_tasks; add availability widgets

---
 infra/pulumi/__main__.py | 203 ++++++++++++++++++++++++++++++++-------
 1 file changed, 167 insertions(+), 36 deletions(-)

diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py
index 03bc04653a1f..dfea9b89c460 100755
--- a/infra/pulumi/__main__.py
+++ b/infra/pulumi/__main__.py
@@ -47,7 +47,12 @@
 
 
 def _alb_requests_widget(o, svc_name, region, period, x, y):
-    suffix = o[f"{svc_name}_alb_suffix"]
+    # Target side metrics use LoadBalancer, TargetGroup to match the alarm
+    # dimensions; ALB side metrics keep LoadBalancer only because that is
+    # the dimension AWS publishes them under
+    lb = o[f"{svc_name}_alb_suffix"]
+    tg = o.get(f"{svc_name}_tg_suffix")
+    target_dims = ["LoadBalancer", lb] + (["TargetGroup", tg] if tg else [])
     return {
         "type": "metric",
         "x": x,
@@ -63,28 +68,26 @@ def _alb_requests_widget(o, svc_name, region, period, x, y):
                     "AWS/ApplicationELB",
                     "RequestCount",
                     "LoadBalancer",
-                    suffix,
+                    lb,
                     {"stat": "Sum"},
                 ],
                 [
                     "AWS/ApplicationELB",
                     "HTTPCode_ELB_5XX_Count",
                     "LoadBalancer",
-                    suffix,
+                    lb,
                     {"stat": "Sum"},
                 ],
                 [
                     "AWS/ApplicationELB",
                     "HTTPCode_Target_5XX_Count",
-                    "LoadBalancer",
-                    suffix,
+                    *target_dims,
                     {"stat": "Sum"},
                 ],
                 [
                     "AWS/ApplicationELB",
                     "TargetResponseTime",
-                    "LoadBalancer",
-                    suffix,
+                    *target_dims,
                     {"stat": "Average", "yAxis": "right"},
                 ],
             ],
@@ -130,14 +133,14 @@ def _ecs_resources_widget(o, svc_name, region, period, x, y, width=8):
     }
 
 
-def _mq_widgets(o, region, period, queue, vhost):
+def _mq_widgets(o, region, period, queue, vhost, y):
     broker = o["mq_broker_name"]
     queue_dims = ["Broker", broker, "VirtualHost", vhost, "Queue", queue]
     return [
         {
             "type": "metric",
             "x": 0,
-            "y": 12,
+            "y": y,
             "width": 12,
             "height": 6,
             "properties": {
@@ -170,7 +173,7 @@ def _mq_widgets(o, region, period, queue, vhost):
         {
             "type": "metric",
             "x": 12,
-            "y": 12,
+            "y": y,
             "width": 12,
             "height": 6,
             "properties": {
@@ -199,13 +202,13 @@ def _mq_widgets(o, region, period, queue, vhost):
     ]
 
 
-def _redis_widgets(o, region, period):
+def _redis_widgets(o, region, period, y):
     cluster_id = o["redis_cluster_id"]
     return [
         {
             "type": "metric",
             "x": 0,
-            "y": 18,
+            "y": y,
             "width": 12,
             "height": 6,
             "properties": {
@@ -234,7 +237,7 @@ def _redis_widgets(o, region, period):
         {
             "type": "metric",
             "x": 12,
-            "y": 18,
+            "y": y,
             "width": 12,
             "height": 6,
             "properties": {
@@ -270,26 +273,140 @@ def _redis_widgets(o, region, period):
     ]
 
 
+def _availability_widgets(o, region, period, mq_queue, mq_vhost, y):
+    # "Is it alive?" panels rendered at the top of the dashboard so positive
+    # availability reads first. Each sub-widget is conditional on the keys
+    # for its source being present in `o`
+    widgets = []
+
+    alb_metrics = []
+    for svc_name in ("web", "versioncheck"):
+        lb = o.get(f"{svc_name}_alb_suffix")
+        tg = o.get(f"{svc_name}_tg_suffix")
+        if lb and tg:
+            alb_metrics.append(
+                [
+                    "AWS/ApplicationELB",
+                    "HealthyHostCount",
+                    "TargetGroup",
+                    tg,
+                    "LoadBalancer",
+                    lb,
+                    {"stat": "Minimum", "label": f"{svc_name} healthy"},
+                ]
+            )
+    if alb_metrics:
+        widgets.append(
+            {
+                "type": "metric",
+                "x": 0,
+                "y": y,
+                "width": 8,
+                "height": 6,
+                "properties": {
+                    "title": "ALB - Healthy Hosts",
+                    "region": region,
+                    "period": period,
+                    "stat": "Minimum",
+                    "metrics": alb_metrics,
+                },
+            }
+        )
+
+    ecs_metrics = []
+    for svc_name in ("web", "worker", "versioncheck"):
+        cluster = o.get(f"{svc_name}_cluster")
+        service = o.get(f"{svc_name}_svc_name")
+        if cluster and service:
+            ecs_metrics.append(
+                [
+                    "ECS/ContainerInsights",
+                    "RunningTaskCount",
+                    "ClusterName",
+                    cluster,
+                    "ServiceName",
+                    service,
+                    {"stat": "Minimum", "label": f"{svc_name} running"},
+                ]
+            )
+    if ecs_metrics:
+        widgets.append(
+            {
+                "type": "metric",
+                "x": 8,
+                "y": y,
+                "width": 8,
+                "height": 6,
+                "properties": {
+                    "title": "ECS - Running Tasks",
+                    "region": region,
+                    "period": period,
+                    "stat": "Minimum",
+                    "metrics": ecs_metrics,
+                },
+            }
+        )
+
+    if "mq_broker_name" in o:
+        broker = o["mq_broker_name"]
+        widgets.append(
+            {
+                "type": "metric",
+                "x": 16,
+                "y": y,
+                "width": 8,
+                "height": 6,
+                "properties": {
+                    "title": f"MQ - Consumers on '{mq_queue}'",
+                    "region": region,
+                    "period": period,
+                    "stat": "Minimum",
+                    "metrics": [
+                        [
+                            "AWS/AmazonMQ",
+                            "ConsumerCount",
+                            "Broker",
+                            broker,
+                            "VirtualHost",
+                            mq_vhost,
+                            "Queue",
+                            mq_queue,
+                            {"stat": "Minimum"},
+                        ],
+                    ],
+                },
+            }
+        )
+
+    return widgets
+
+
 def _build_dashboard_body(o, region, period, mq_queue, mq_vhost):
     widgets = []
+    # Row 0: availability - is it alive?
+    widgets.extend(_availability_widgets(o, region, period, mq_queue, mq_vhost, y=0))
+    # Row 1: ALB requests/errors per service
     if "web_alb_suffix" in o:
-        widgets.append(_alb_requests_widget(o, "web", region, period, x=0, y=0))
+        widgets.append(_alb_requests_widget(o, "web", region, period, x=0, y=6))
     if "versioncheck_alb_suffix" in o:
         widgets.append(
-            _alb_requests_widget(o, "versioncheck", region, period, x=12, y=0)
+            _alb_requests_widget(o, "versioncheck", region, period, x=12, y=6)
         )
+    # Row 2: ECS CPU/memory per service
     if "web_cluster" in o and "web_svc_name" in o:
-        widgets.append(_ecs_resources_widget(o, "web", region, period, x=0, y=6))
+        widgets.append(_ecs_resources_widget(o, "web", region, period, x=0, y=12))
     if "worker_cluster" in o and "worker_svc_name" in o:
-        widgets.append(_ecs_resources_widget(o, "worker", region, period, x=8, y=6))
+        widgets.append(_ecs_resources_widget(o, "worker", region, period, x=8, y=12))
     if "versioncheck_cluster" in o and "versioncheck_svc_name" in o:
         widgets.append(
-            _ecs_resources_widget(o, "versioncheck", region, period, x=16, y=6)
+            _ecs_resources_widget(o, "versioncheck", region, period, x=16, y=12)
         )
+    # Row 3: Amazon MQ
     if "mq_broker_name" in o:
-        widgets.extend(_mq_widgets(o, region, period, mq_queue, mq_vhost))
+        widgets.extend(_mq_widgets(o, region, period, mq_queue, mq_vhost, y=18))
+    # Row 4: Redis
     if "redis_cluster_id" in o:
-        widgets.extend(_redis_widgets(o, region, period))
+        widgets.extend(_redis_widgets(o, region, period, y=24))
     return json.dumps({"widgets": widgets})
 
 
@@ -1144,7 +1261,7 @@ def main():
         )
 
     # =========================================================================
-    # Monitoring and Alarms (prod-gating baseline)
+    # Monitoring and Alarms (stage env-gating baseline)
     # =========================================================================
     # Phase 1 observability: SNS notification path, CloudWatch alarms for
     # ALB/TG/ECS/MQ/Redis, and one operational dashboard
@@ -1405,7 +1522,8 @@ def main():
         ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80)
         ecs_period = ecs_cfg.get("period", 300)
         ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2)
-        ecs_min_tasks = ecs_cfg.get("min_tasks", 1)
+        ecs_min_tasks_default = ecs_cfg.get("min_tasks", 1)
+        ecs_min_tasks_per_svc = ecs_cfg.get("min_tasks_per_service", {})
 
         for svc_name, fargate_svc in fargate_services.items():
             ecs_service = fargate_svc.resources.get("service")
@@ -1470,10 +1588,19 @@ def main():
                 opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
             )
 
-            # Container Insights publishes RunningTaskCount per service in the
-            # ECS/ContainerInsights namespace. Operators draining a service
-            # intentionally should override `min_tasks` per service in config
-            # or temporarily disable this alarm
+            # RunningTaskCount lives in the ECS/ContainerInsights namespace,
+            # which is only populated when Container Insights is enabled on
+            # the cluster. tb_pulumi.fargate enables it via the
+            # `enable_container_insights: true` flag in config.stage.yaml. If
+            # that ever flips to false, this alarm goes to ALARM because of
+            # the breaching missing-data treatment below, which surfaces the
+            # misconfiguration loudly instead of silently disabling
+            # availability monitoring
+            #
+            # Operators intentionally scaling a service down should override
+            # `min_tasks_per_service` in config; for a full drain, temporarily
+            # disable this alarm, since missing data is treated as breaching
+            min_tasks = ecs_min_tasks_per_svc.get(svc_name, ecs_min_tasks_default)
             aws.cloudwatch.MetricAlarm(
                 f"{project.name_prefix}-{svc_name}-running-tasks",
                 name=f"{project.name_prefix}-{svc_name}-running-tasks",
@@ -1487,14 +1614,14 @@ def main():
                 metric_name="RunningTaskCount",
                 namespace="ECS/ContainerInsights",
                 statistic="Minimum",
-                threshold=ecs_min_tasks,
+                threshold=min_tasks,
                 period=ecs_period,
                 evaluation_periods=ecs_eval_periods,
                 # Container Insights stops emitting when a service is fully
                 # drained; that is exactly the failure we want to catch
                 treat_missing_data="breaching",
                 alarm_description=(
-                    f"Running task count below {ecs_min_tasks} on {svc_name}. "
+                    f"Running task count below {min_tasks} on {svc_name}. "
                     "Check: deployment status, service events for stop "
                     "reasons, scheduled actions, task health"
                 ),
@@ -1646,13 +1773,14 @@ def main():
             redis_eval_periods = redis_cfg.get("evaluation_periods", 2)
 
             replication_group = redis_cluster.resources["replication_group"]
-            # ElastiCache publishes per-node metrics under the cache cluster ID,
-            # which for a single-node replication group is `<rg-id>-001`. Verify
-            # at first deploy by reading one CloudWatch datapoint for the
-            # alarms below; if the dimension value does not match an emitted
-            # series, switch to `replication_group.member_clusters[0]` (a list
-            # output that holds the actual cache cluster IDs)
-            cache_cluster_id = replication_group.id.apply(lambda rg_id: f"{rg_id}-001")
+            # ElastiCache publishes per-node metrics under the cache cluster ID.
+            # Use the provider's actual member_clusters output rather than
+            # reconstructing the AWS naming convention (`<rg-id>-001`); for our
+            # single-node replication group this resolves to the same value and
+            # survives AWS naming changes; only the first member is monitored
+            cache_cluster_id = replication_group.member_clusters.apply(
+                lambda clusters: clusters[0]
+            )
 
             aws.cloudwatch.MetricAlarm(
                 f"{project.name_prefix}-redis-memory",
@@ -1794,8 +1922,11 @@ def main():
                 svc_alb = fargate_svc.resources.get("fargate_service_alb")
                 if svc_alb:
                     alb = svc_alb.resources["albs"].get(svc_name)
+                    tg = svc_alb.resources["target_groups"].get(svc_name)
                     if alb:
                         dashboard_outputs[f"{svc_name}_alb_suffix"] = alb.arn_suffix
+                    if tg:
+                        dashboard_outputs[f"{svc_name}_tg_suffix"] = tg.arn_suffix
                 svc_res = fargate_svc.resources.get("service")
                 cluster_res = fargate_svc.resources.get("cluster")
                 if svc_res:
@@ -1824,7 +1955,7 @@ def main():
         if redis_cluster:
             dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[
                 "replication_group"
-            ].id.apply(lambda rg_id: f"{rg_id}-001")
+            ].member_clusters.apply(lambda clusters: clusters[0])
 
         mq_queue = alarm_cfg.get("mq", {}).get("queue_name", "olympia")
         mq_vhost_dash = alarm_cfg.get("mq", {}).get("virtual_host", "/")