diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py index 9baacf48f1f3..dfea9b89c460 100755 --- a/infra/pulumi/__main__.py +++ b/infra/pulumi/__main__.py @@ -36,6 +36,380 @@ import tb_pulumi.network +# --------------------------------------------------------------------------- +# CloudWatch dashboard widget builders +# +# Each builder takes the resolved-output dict `o` (string values produced by +# pulumi.Output.all().apply) plus pure-Python layout config, and returns the +# CloudWatch dashboard widget shape as a dict (or list of dicts) +# They have no Pulumi deps and can be unit tested in isolation +# --------------------------------------------------------------------------- + + +def _alb_requests_widget(o, svc_name, region, period, x, y): + # Target side metrics use LoadBalancer, TargetGroup to match the alarm + # dimensions; ALB side metrics keep LoadBalancer only because that is + # the dimension AWS publishes them under + lb = o[f"{svc_name}_alb_suffix"] + tg = o.get(f"{svc_name}_tg_suffix") + target_dims = ["LoadBalancer", lb] + (["TargetGroup", tg] if tg else []) + return { + "type": "metric", + "x": x, + "y": y, + "width": 12, + "height": 6, + "properties": { + "title": f"{svc_name.capitalize()} ALB - Requests and Errors", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ApplicationELB", + "RequestCount", + "LoadBalancer", + lb, + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_ELB_5XX_Count", + "LoadBalancer", + lb, + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "HTTPCode_Target_5XX_Count", + *target_dims, + {"stat": "Sum"}, + ], + [ + "AWS/ApplicationELB", + "TargetResponseTime", + *target_dims, + {"stat": "Average", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Seconds", "showUnits": False}}, + }, + } + + +def _ecs_resources_widget(o, svc_name, region, period, x, y, width=8): + cluster = o[f"{svc_name}_cluster"] + service = o[f"{svc_name}_svc_name"] + return { + "type": "metric", + "x": x, + "y": y, + "width": width, + "height": 6, + "properties": { + "title": f"{svc_name.capitalize()} ECS - CPU and Memory", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ECS", + "CPUUtilization", + "ClusterName", + cluster, + "ServiceName", + service, + {"stat": "Average"}, + ], + [ + "AWS/ECS", + "MemoryUtilization", + "ClusterName", + cluster, + "ServiceName", + service, + {"stat": "Average"}, + ], + ], + }, + } + + +def _mq_widgets(o, region, period, queue, vhost, y): + broker = o["mq_broker_name"] + queue_dims = ["Broker", broker, "VirtualHost", vhost, "Queue", queue] + return [ + { + "type": "metric", + "x": 0, + "y": y, + "width": 12, + "height": 6, + "properties": { + "title": "Amazon MQ - Queue Health", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/AmazonMQ", + "MessageReadyCount", + *queue_dims, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "MessageUnacknowledgedCount", + *queue_dims, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "ConsumerCount", + *queue_dims, + {"stat": "Minimum", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Consumers", "showUnits": False}}, + }, + }, + { + "type": "metric", + "x": 12, + "y": y, + "width": 12, + "height": 6, + "properties": { + "title": "Amazon MQ - Broker Resources", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/AmazonMQ", + "SystemCpuUtilization", + "Broker", + broker, + {"stat": "Average"}, + ], + [ + "AWS/AmazonMQ", + "RabbitMQMemUsed", + "Broker", + broker, + {"stat": "Average", "yAxis": 
"right"}, + ], + ], + "yAxis": {"right": {"label": "Bytes", "showUnits": False}}, + }, + }, + ] + + +def _redis_widgets(o, region, period, y): + cluster_id = o["redis_cluster_id"] + return [ + { + "type": "metric", + "x": 0, + "y": y, + "width": 12, + "height": 6, + "properties": { + "title": "Redis - Memory and Evictions", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ElastiCache", + "DatabaseMemoryUsagePercentage", + "CacheClusterId", + cluster_id, + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "Evictions", + "CacheClusterId", + cluster_id, + {"stat": "Sum", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Count", "showUnits": False}}, + }, + }, + { + "type": "metric", + "x": 12, + "y": y, + "width": 12, + "height": 6, + "properties": { + "title": "Redis - CPU and Connections", + "region": region, + "period": period, + "metrics": [ + [ + "AWS/ElastiCache", + "EngineCPUUtilization", + "CacheClusterId", + cluster_id, + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "CPUUtilization", + "CacheClusterId", + cluster_id, + {"stat": "Average"}, + ], + [ + "AWS/ElastiCache", + "CurrConnections", + "CacheClusterId", + cluster_id, + {"stat": "Average", "yAxis": "right"}, + ], + ], + "yAxis": {"right": {"label": "Connections", "showUnits": False}}, + }, + }, + ] + + +def _availability_widgets(o, region, period, mq_queue, mq_vhost, y): + # "Is it alive?" panels rendered at the top of the dashboard so positive + # availability reads first. Each sub-widget is conditional on the keys + # for its source being present in `o` + widgets = [] + + alb_metrics = [] + for svc_name in ("web", "versioncheck"): + lb = o.get(f"{svc_name}_alb_suffix") + tg = o.get(f"{svc_name}_tg_suffix") + if lb and tg: + alb_metrics.append( + [ + "AWS/ApplicationELB", + "HealthyHostCount", + "TargetGroup", + tg, + "LoadBalancer", + lb, + {"stat": "Minimum", "label": f"{svc_name} healthy"}, + ] + ) + if alb_metrics: + widgets.append( + { + "type": "metric", + "x": 0, + "y": y, + "width": 8, + "height": 6, + "properties": { + "title": "ALB - Healthy Hosts", + "region": region, + "period": period, + "stat": "Minimum", + "metrics": alb_metrics, + }, + } + ) + + ecs_metrics = [] + for svc_name in ("web", "worker", "versioncheck"): + cluster = o.get(f"{svc_name}_cluster") + service = o.get(f"{svc_name}_svc_name") + if cluster and service: + ecs_metrics.append( + [ + "ECS/ContainerInsights", + "RunningTaskCount", + "ClusterName", + cluster, + "ServiceName", + service, + {"stat": "Minimum", "label": f"{svc_name} running"}, + ] + ) + if ecs_metrics: + widgets.append( + { + "type": "metric", + "x": 8, + "y": y, + "width": 8, + "height": 6, + "properties": { + "title": "ECS - Running Tasks", + "region": region, + "period": period, + "stat": "Minimum", + "metrics": ecs_metrics, + }, + } + ) + + if "mq_broker_name" in o: + broker = o["mq_broker_name"] + widgets.append( + { + "type": "metric", + "x": 16, + "y": y, + "width": 8, + "height": 6, + "properties": { + "title": f"MQ - Consumers on '{mq_queue}'", + "region": region, + "period": period, + "stat": "Minimum", + "metrics": [ + [ + "AWS/AmazonMQ", + "ConsumerCount", + "Broker", + broker, + "VirtualHost", + mq_vhost, + "Queue", + mq_queue, + {"stat": "Minimum"}, + ], + ], + }, + } + ) + + return widgets + + +def _build_dashboard_body(o, region, period, mq_queue, mq_vhost): + widgets = [] + # Row 0: availability - is it alive? 
+ widgets.extend(_availability_widgets(o, region, period, mq_queue, mq_vhost, y=0)) + # Row 1: ALB requests/errors per service + if "web_alb_suffix" in o: + widgets.append(_alb_requests_widget(o, "web", region, period, x=0, y=6)) + if "versioncheck_alb_suffix" in o: + widgets.append( + _alb_requests_widget(o, "versioncheck", region, period, x=12, y=6) + ) + # Row 2: ECS CPU/memory per service + if "web_cluster" in o and "web_svc_name" in o: + widgets.append(_ecs_resources_widget(o, "web", region, period, x=0, y=12)) + if "worker_cluster" in o and "worker_svc_name" in o: + widgets.append(_ecs_resources_widget(o, "worker", region, period, x=8, y=12)) + if "versioncheck_cluster" in o and "versioncheck_svc_name" in o: + widgets.append( + _ecs_resources_widget(o, "versioncheck", region, period, x=16, y=12) + ) + # Row 3: Amazon MQ + if "mq_broker_name" in o: + widgets.extend(_mq_widgets(o, region, period, mq_queue, mq_vhost, y=18)) + # Row 4: Redis + if "redis_cluster_id" in o: + widgets.extend(_redis_widgets(o, region, period, y=24)) + return json.dumps({"widgets": widgets}) + + def main(): # Create a ThunderbirdPulumiProject to aggregate resources # This loads config.{stack}.yaml automatically @@ -768,6 +1142,7 @@ def main(): # Dedicated stage broker replacing the production EC2 RabbitMQ that # atn/stage/celery_broker previously pointed to (issue #375) mq_config = resources.get("aws:mq:RabbitMQBroker", {}) + mq_broker = None if mq_config and private_subnets and vpc_resource: mq_creds_secret_name = mq_config.get("credentials_secret_name") @@ -885,6 +1260,730 @@ def main(): ), ) + # ========================================================================= + # Monitoring and Alarms (stage env-gating baseline) + # ========================================================================= + # Phase 1 observability: SNS notification path, CloudWatch alarms for + # ALB/TG/ECS/MQ/Redis, and one operational dashboard + # + # All alarms are written explicitly (not via CloudWatchMonitoringGroup) + # for a single SNS topic, full control over alarm descriptions, and + # correct metric names (upstream tb_pulumi has a target_5xx metric bug) + # + # Thresholds live in config.stage.yaml under resources.monitoring.alarms + monitoring_cfg = resources.get("monitoring", {}) + alarm_cfg = monitoring_cfg.get("alarms", {}) + + if monitoring_cfg and fargate_services: + notify_secret_name = monitoring_cfg.get("notify_emails_secret_name") + notify_emails = [] + if notify_secret_name: + notify_emails_raw = aws.secretsmanager.get_secret_version( + secret_id=notify_secret_name, + ) + notify_emails = [ + e.strip() + for e in notify_emails_raw.secret_string.split(",") + if e.strip() + ] + + # ----------------------------------------------------------------- + # SNS topic + email subscriptions + # ----------------------------------------------------------------- + alarm_topic = aws.sns.Topic( + f"{project.name_prefix}-alarm-topic", + name=f"{project.name_prefix}-alarms", + tags={ + **project.common_tags, + "Name": f"{project.name_prefix}-alarms", + }, + ) + + for idx, email in enumerate(notify_emails): + aws.sns.TopicSubscription( + f"{project.name_prefix}-alarm-sub-{idx}", + protocol="email", + endpoint=email, + topic=alarm_topic.arn, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), + ) + + # If notifications fail to deliver, every other alarm in this stack is + # also silently undelivered. 
We publish this alarm to the same topic + # for CloudWatch-console visibility; for Phase 2 a secondary channel + # (SMS, Slack, separate topic) should provide an independent path + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-alarm-topic-delivery-failures", + name=f"{project.name_prefix}-alarm-topic-delivery-failures", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"TopicName": alarm_topic.name}, + metric_name="NumberOfNotificationsFailed", + namespace="AWS/SNS", + statistic="Sum", + threshold=1, + period=300, + evaluation_periods=1, + treat_missing_data="notBreaching", + alarm_description=( + "One or more alarm notifications failed delivery from the " + "stage alarm topic. Check: SNS subscription confirmations, " + "recipient email validity, topic policy" + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), + ) + + # ----------------------------------------------------------------- + # ALB alarms (web, versioncheck) + # ----------------------------------------------------------------- + alb_cfg = alarm_cfg.get("alb", {}) + alb_error_threshold = alb_cfg.get("error_threshold", 10) + alb_error_period = alb_cfg.get("error_period", 60) + alb_rt_threshold = alb_cfg.get("response_time_threshold", 1) + alb_rt_period = alb_cfg.get("response_time_period", 60) + alb_eval_periods = alb_cfg.get("evaluation_periods", 2) + + for svc_name in ["web", "versioncheck"]: + fargate_svc = fargate_services.get(svc_name) + if not fargate_svc: + continue + svc_alb = fargate_svc.resources.get("fargate_service_alb") + if not svc_alb: + continue + alb = svc_alb.resources["albs"].get(svc_name) + tg = svc_alb.resources["target_groups"].get(svc_name) + if not alb or not tg: + continue + + lb_suffix = alb.arn_suffix + tg_suffix = tg.arn_suffix + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-alb-5xx", + name=f"{project.name_prefix}-{svc_name}-alb-5xx", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"LoadBalancer": lb_suffix}, + metric_name="HTTPCode_ELB_5XX_Count", + namespace="AWS/ApplicationELB", + statistic="Sum", + threshold=alb_error_threshold, + period=alb_error_period, + evaluation_periods=alb_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Elevated 5xx errors on the {svc_name} ALB. " + "Check: ECS task health in console, then " + "application logs in CloudWatch for stack traces." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-{svc_name}-target-5xx", + name=f"{project.name_prefix}-{svc_name}-target-5xx", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={ + "LoadBalancer": lb_suffix, + "TargetGroup": tg_suffix, + }, + metric_name="HTTPCode_Target_5XX_Count", + namespace="AWS/ApplicationELB", + statistic="Sum", + threshold=alb_error_threshold, + period=alb_error_period, + evaluation_periods=alb_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Elevated 5xx errors from {svc_name} application targets. " + "Check: application logs for exceptions, database " + "connectivity, and upstream dependency health." 
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-response-time",
+                name=f"{project.name_prefix}-{svc_name}-response-time",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "LoadBalancer": lb_suffix,
+                    "TargetGroup": tg_suffix,
+                },
+                metric_name="TargetResponseTime",
+                namespace="AWS/ApplicationELB",
+                statistic="Average",
+                threshold=alb_rt_threshold,
+                period=alb_rt_period,
+                evaluation_periods=alb_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Average response time above {alb_rt_threshold}s on {svc_name}. "
+                    "Check: is traffic elevated? Are database queries slow? "
+                    "Is Memcached reachable?"
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, alb]),
+            )
+
+        # -----------------------------------------------------------------
+        # Target group alarms (web, versioncheck)
+        # -----------------------------------------------------------------
+        tg_cfg = alarm_cfg.get("target_group", {})
+        tg_unhealthy_threshold = tg_cfg.get("unhealthy_threshold", 1)
+        tg_period = tg_cfg.get("period", 60)
+        tg_eval_periods = tg_cfg.get("evaluation_periods", 2)
+
+        for svc_name in ["web", "versioncheck"]:
+            fargate_svc = fargate_services.get(svc_name)
+            if not fargate_svc:
+                continue
+            svc_alb = fargate_svc.resources.get("fargate_service_alb")
+            if not svc_alb:
+                continue
+            alb = svc_alb.resources["albs"].get(svc_name)
+            tg = svc_alb.resources["target_groups"].get(svc_name)
+            if not alb or not tg:
+                continue
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-unhealthy-hosts",
+                name=f"{project.name_prefix}-{svc_name}-unhealthy-hosts",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "TargetGroup": tg.arn_suffix,
+                    "LoadBalancer": alb.arn_suffix,
+                },
+                metric_name="UnHealthyHostCount",
+                namespace="AWS/ApplicationELB",
+                statistic="Average",
+                threshold=tg_unhealthy_threshold,
+                period=tg_period,
+                evaluation_periods=tg_eval_periods,
+                # Positive availability is covered by the healthy-hosts alarm
+                # below; here we want to catch elevated unhealthy hosts even
+                # when at least one healthy host remains
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Unhealthy hosts detected in {svc_name} target group. "
+                    "Check: ECS task status, health check endpoint "
+                    "(/services/monitor.json), container logs."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-healthy-hosts",
+                name=f"{project.name_prefix}-{svc_name}-healthy-hosts",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="LessThanThreshold",
+                dimensions={
+                    "TargetGroup": tg.arn_suffix,
+                    "LoadBalancer": alb.arn_suffix,
+                },
+                metric_name="HealthyHostCount",
+                namespace="AWS/ApplicationELB",
+                statistic="Minimum",
+                threshold=tg_cfg.get("healthy_threshold", 1),
+                period=tg_period,
+                evaluation_periods=tg_eval_periods,
+                # Missing data on this metric means the target group has no
+                # registered targets -- operationally indistinguishable from
+                # zero healthy hosts and therefore treated as breaching
+                treat_missing_data="breaching",
+                alarm_description=(
+                    f"No healthy hosts in {svc_name} target group. "
+                    "Check: is the ECS service running, is the container "
+                    "health-check responding (/services/monitor.json), is "
+                    "the SG allowing traffic from the ALB?"
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, tg]),
+            )
+
+        # -----------------------------------------------------------------
+        # ECS service alarms (web, worker, versioncheck)
+        # -----------------------------------------------------------------
+        ecs_cfg = alarm_cfg.get("ecs", {})
+        ecs_cpu_threshold = ecs_cfg.get("cpu_threshold", 80)
+        ecs_mem_threshold = ecs_cfg.get("memory_threshold", 80)
+        ecs_period = ecs_cfg.get("period", 300)
+        ecs_eval_periods = ecs_cfg.get("evaluation_periods", 2)
+        ecs_min_tasks_default = ecs_cfg.get("min_tasks", 1)
+        ecs_min_tasks_per_svc = ecs_cfg.get("min_tasks_per_service", {})
+
+        for svc_name, fargate_svc in fargate_services.items():
+            ecs_service = fargate_svc.resources.get("service")
+            ecs_cluster = fargate_svc.resources.get("cluster")
+            if not ecs_service or not ecs_cluster:
+                continue
+
+            cluster_name = ecs_cluster.arn.apply(lambda arn: arn.split("/")[-1])
+            service_name = ecs_service.name
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-ecs-cpu",
+                name=f"{project.name_prefix}-{svc_name}-ecs-cpu",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "ClusterName": cluster_name,
+                    "ServiceName": service_name,
+                },
+                metric_name="CPUUtilization",
+                namespace="AWS/ECS",
+                statistic="Average",
+                threshold=ecs_cpu_threshold,
+                period=ecs_period,
+                evaluation_periods=ecs_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"CPU utilization above {ecs_cpu_threshold}% on {svc_name} service. "
+                    "Check: is traffic elevated? Are tasks stuck? "
+                    "Consider scaling if sustained."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
+            )
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-ecs-memory",
+                name=f"{project.name_prefix}-{svc_name}-ecs-memory",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "ClusterName": cluster_name,
+                    "ServiceName": service_name,
+                },
+                metric_name="MemoryUtilization",
+                namespace="AWS/ECS",
+                statistic="Average",
+                threshold=ecs_mem_threshold,
+                period=ecs_period,
+                evaluation_periods=ecs_eval_periods,
+                # Positive availability is covered by the running-tasks alarm
+                # below; CPU/memory only matter while tasks exist
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Memory utilization above {ecs_mem_threshold}% on {svc_name} service. "
+                    "Check: application memory leaks, task resource limits, "
+                    "consider scaling."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
+            )
+
+            # RunningTaskCount lives in the ECS/ContainerInsights namespace,
+            # which is only populated when Container Insights is enabled on
+            # the cluster. tb_pulumi.fargate enables it via the
+            # `enable_container_insights: true` flag in config.stage.yaml. If
+            # that ever flips to false, this alarm goes immediately to ALARM
+            # because of the breaching missing-data treatment below --
+            # surfacing the misconfiguration loudly rather than silently
+            # disabling availability monitoring
+            #
+            # Operators intentionally draining a service should override
+            # `min_tasks_per_service` in config (e.g., `worker: 0`) or
+            # temporarily disable this alarm
+            min_tasks = ecs_min_tasks_per_svc.get(svc_name, ecs_min_tasks_default)
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-{svc_name}-running-tasks",
+                name=f"{project.name_prefix}-{svc_name}-running-tasks",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="LessThanThreshold",
+                dimensions={
+                    "ClusterName": cluster_name,
+                    "ServiceName": service_name,
+                },
+                metric_name="RunningTaskCount",
+                namespace="ECS/ContainerInsights",
+                statistic="Minimum",
+                threshold=min_tasks,
+                period=ecs_period,
+                evaluation_periods=ecs_eval_periods,
+                # Container Insights stops emitting when a service is fully
+                # drained; that is exactly the failure we want to catch
+                treat_missing_data="breaching",
+                alarm_description=(
+                    f"Running task count below {min_tasks} on {svc_name}. "
+                    "Check: deployment status, service events for stop "
+                    "reasons, scheduled actions, task health."
+                ),
+                tags=project.common_tags,
+                opts=pulumi.ResourceOptions(depends_on=[alarm_topic, ecs_service]),
+            )
+
+        # -----------------------------------------------------------------
+        # Amazon MQ alarms
+        # -----------------------------------------------------------------
+        mq_cfg = alarm_cfg.get("mq", {})
+
+        if mq_broker is not None:
+            mq_queue_name = mq_cfg.get("queue_name", "olympia")
+            mq_vhost = mq_cfg.get("virtual_host", "/")
+            mq_msg_threshold = mq_cfg.get("message_ready_threshold", 1000)
+            mq_consumer_alarm_enabled = mq_cfg.get("consumer_alarm_enabled", False)
+            mq_consumer_threshold = mq_cfg.get("consumer_count_threshold", 1)
+            mq_cpu_threshold = mq_cfg.get("cpu_threshold", 80)
+            mq_mem_threshold = mq_cfg.get("memory_bytes_threshold", 512000000)
+            mq_period = mq_cfg.get("period", 300)
+            mq_eval_periods = mq_cfg.get("evaluation_periods", 2)
+
+            # AWS publishes Amazon MQ for RabbitMQ metrics with the `Broker`
+            # dimension set to the broker name, not the broker ID. The Pulumi
+            # `aws.mq.Broker.id` output is the AWS broker UUID (e.g.
+            # b-xxxxxxxx-...), which would point at a non-existent metric
+            # series
+            broker_name = mq_broker.broker_name
+
+            aws.cloudwatch.MetricAlarm(
+                f"{project.name_prefix}-mq-message-ready",
+                name=f"{project.name_prefix}-mq-message-ready",
+                alarm_actions=[alarm_topic.arn],
+                ok_actions=[alarm_topic.arn],
+                comparison_operator="GreaterThanOrEqualToThreshold",
+                dimensions={
+                    "Broker": broker_name,
+                    "VirtualHost": mq_vhost,
+                    "Queue": mq_queue_name,
+                },
+                metric_name="MessageReadyCount",
+                namespace="AWS/AmazonMQ",
+                statistic="Average",
+                threshold=mq_msg_threshold,
+                period=mq_period,
+                evaluation_periods=mq_eval_periods,
+                treat_missing_data="notBreaching",
+                alarm_description=(
+                    f"Queue '{mq_queue_name}' has over {mq_msg_threshold} "
+                    "ready messages. Check: is the worker consuming? "
+                    "Are tasks backing up? Check worker logs."
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + if mq_consumer_alarm_enabled: + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-consumer-count", + name=f"{project.name_prefix}-mq-consumer-count", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="LessThanThreshold", + dimensions={ + "Broker": broker_name, + "VirtualHost": mq_vhost, + "Queue": mq_queue_name, + }, + metric_name="ConsumerCount", + namespace="AWS/AmazonMQ", + statistic="Minimum", + threshold=mq_consumer_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + treat_missing_data="breaching", + alarm_description=( + f"No consumers connected to the '{mq_queue_name}' queue. " + "Check: is the worker service running? " + "Check worker logs for connection errors." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-cpu", + name=f"{project.name_prefix}-mq-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"Broker": broker_name}, + metric_name="SystemCpuUtilization", + namespace="AWS/AmazonMQ", + statistic="Average", + threshold=mq_cpu_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + # Managed broker emits resource metrics whenever it is RUNNING; + # absence of data indicates the broker itself is in trouble. + treat_missing_data="breaching", + alarm_description=( + f"Broker CPU above {mq_cpu_threshold}%. Check: " + "queue depth, message throughput, consider " + "upgrading instance type if sustained." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-mq-memory", + name=f"{project.name_prefix}-mq-memory", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"Broker": broker_name}, + metric_name="RabbitMQMemUsed", + namespace="AWS/AmazonMQ", + statistic="Average", + threshold=mq_mem_threshold, + period=mq_period, + evaluation_periods=mq_eval_periods, + # Same rationale as mq-cpu: missing data on a managed broker + # is itself a failure signal. + treat_missing_data="breaching", + alarm_description=( + f"Broker memory above {mq_mem_threshold} bytes. " + "Check: queue depth and message sizes, consider " + "purging stale queues or upgrading instance." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic, mq_broker]), + ) + + # ----------------------------------------------------------------- + # Redis alarms + # ----------------------------------------------------------------- + redis_cfg = alarm_cfg.get("redis", {}) + redis_cluster = elasticache_clusters.get("redis") + + if redis_cluster: + redis_mem_threshold = redis_cfg.get("memory_pct_threshold", 80) + redis_eviction_threshold = redis_cfg.get("eviction_threshold", 100) + redis_cpu_threshold = redis_cfg.get("cpu_threshold", 80) + redis_host_cpu_threshold = redis_cfg.get("host_cpu_threshold", 90) + redis_conn_threshold = redis_cfg.get("connection_threshold", 500) + redis_period = redis_cfg.get("period", 300) + redis_eval_periods = redis_cfg.get("evaluation_periods", 2) + + replication_group = redis_cluster.resources["replication_group"] + # ElastiCache publishes per-node metrics under the cache cluster ID. 
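+            # (e.g. a replication group named "foo" surfaces members
+            # "foo-001", "foo-002", ...).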
+ # Use the provider's actual member_clusters output rather than + # reconstructing the AWS naming convention (`-001`); for our + # single-node replication group this resolves to the same value but + # is robust against multi-node setups and AWS naming changes + cache_cluster_id = replication_group.member_clusters.apply( + lambda clusters: clusters[0] + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-memory", + name=f"{project.name_prefix}-redis-memory", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="DatabaseMemoryUsagePercentage", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_mem_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis memory usage above {redis_mem_threshold}%. " + "Check: eviction count, key count growth, " + "potential memory leak in application cache usage." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-evictions", + name=f"{project.name_prefix}-redis-evictions", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="Evictions", + namespace="AWS/ElastiCache", + statistic="Sum", + threshold=redis_eviction_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis evictions above {redis_eviction_threshold} " + "per period. Check: memory usage, maxmemory-policy, " + "whether the application is over-caching." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-cpu", + name=f"{project.name_prefix}-redis-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="EngineCPUUtilization", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_cpu_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis engine CPU above {redis_cpu_threshold}%. " + "Check: command complexity (KEYS, SORT), " + "connection count, consider node upgrade." + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-connections", + name=f"{project.name_prefix}-redis-connections", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="CurrConnections", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_conn_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis connections above {redis_conn_threshold}. " + "Check: connection pool settings, task/service " + "count, potential connection leaks." 
+ ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + aws.cloudwatch.MetricAlarm( + f"{project.name_prefix}-redis-host-cpu", + name=f"{project.name_prefix}-redis-host-cpu", + alarm_actions=[alarm_topic.arn], + ok_actions=[alarm_topic.arn], + comparison_operator="GreaterThanOrEqualToThreshold", + dimensions={"CacheClusterId": cache_cluster_id}, + metric_name="CPUUtilization", + namespace="AWS/ElastiCache", + statistic="Average", + threshold=redis_host_cpu_threshold, + period=redis_period, + evaluation_periods=redis_eval_periods, + treat_missing_data="notBreaching", + alarm_description=( + f"Redis host CPU above {redis_host_cpu_threshold}%. " + "This monitors the underlying host, not just the Redis " + "engine. On nodes with <= 2 vCPUs, EngineCPUUtilization " + "alone can miss host overload. Check: background processes, " + "node type, consider upgrading" + ), + tags=project.common_tags, + opts=pulumi.ResourceOptions( + depends_on=[alarm_topic, replication_group] + ), + ) + + # ----------------------------------------------------------------- + # CloudWatch Dashboard + # ----------------------------------------------------------------- + dash_cfg = monitoring_cfg.get("dashboard", {}) + dash_period = dash_cfg.get("period", 300) + + dashboard_outputs = {} + for svc_name in ["web", "versioncheck"]: + fargate_svc = fargate_services.get(svc_name) + if fargate_svc: + svc_alb = fargate_svc.resources.get("fargate_service_alb") + if svc_alb: + alb = svc_alb.resources["albs"].get(svc_name) + tg = svc_alb.resources["target_groups"].get(svc_name) + if alb: + dashboard_outputs[f"{svc_name}_alb_suffix"] = alb.arn_suffix + if tg: + dashboard_outputs[f"{svc_name}_tg_suffix"] = tg.arn_suffix + svc_res = fargate_svc.resources.get("service") + cluster_res = fargate_svc.resources.get("cluster") + if svc_res: + dashboard_outputs[f"{svc_name}_svc_name"] = svc_res.name + if cluster_res: + dashboard_outputs[f"{svc_name}_cluster"] = cluster_res.arn.apply( + lambda arn: arn.split("/")[-1] + ) + + worker_svc = fargate_services.get("worker") + if worker_svc: + svc_res = worker_svc.resources.get("service") + cluster_res = worker_svc.resources.get("cluster") + if svc_res: + dashboard_outputs["worker_svc_name"] = svc_res.name + if cluster_res: + dashboard_outputs["worker_cluster"] = cluster_res.arn.apply( + lambda arn: arn.split("/")[-1] + ) + + if mq_broker is not None: + # Dashboard widgets feed this value into the `Broker` CloudWatch + # dimension, which AWS keys by broker name (not the b-... 
UUID) + dashboard_outputs["mq_broker_name"] = mq_broker.broker_name + + if redis_cluster: + dashboard_outputs["redis_cluster_id"] = redis_cluster.resources[ + "replication_group" + ].member_clusters.apply(lambda clusters: clusters[0]) + + mq_queue = alarm_cfg.get("mq", {}).get("queue_name", "olympia") + mq_vhost_dash = alarm_cfg.get("mq", {}).get("virtual_host", "/") + region = project.aws_region + + if dashboard_outputs: + dashboard_body = pulumi.Output.all(**dashboard_outputs).apply( + lambda o: _build_dashboard_body( + o, region, dash_period, mq_queue, mq_vhost_dash + ) + ) + + aws.cloudwatch.Dashboard( + f"{project.name_prefix}-dashboard", + dashboard_name=f"{project.name_prefix}-health", + dashboard_body=dashboard_body, + opts=pulumi.ResourceOptions(depends_on=[alarm_topic]), + ) + + # ----------------------------------------------------------------- + # Monitoring exports + # ----------------------------------------------------------------- + pulumi.export("monitoring_sns_topic_arn", alarm_topic.arn) + pulumi.export( + "monitoring_dashboard_name", + f"{project.name_prefix}-health", + ) + # ========================================================================= # ECS Scheduled Tasks (Cron Jobs) # ========================================================================= diff --git a/infra/pulumi/config.stage.yaml b/infra/pulumi/config.stage.yaml index fa40c65c9171..75c9a6d8ef3a 100644 --- a/infra/pulumi/config.stage.yaml +++ b/infra/pulumi/config.stage.yaml @@ -638,6 +638,52 @@ resources: # - efs_filesystem_id -- replaced by Pulumi-managed aws:efs:FileSystem # - mq_admin_password -- replaced by mq_credentials (JSON) + # ============================================================================= + # Monitoring and Alarms (prod-gating baseline) + # ============================================================================= + # Phase 1: ALB, target group, ECS service, Amazon MQ, Redis alarms + # with a single CloudWatch dashboard for is-stage-healthy? triage + # Phase 2 (future): EFS, log metric filters, deployment instability, + # external/shared resources (RDS, Memcached, OpenSearch) + monitoring: + notify_emails_secret_name: atn/stage/monitoring_notify_emails + alarms: + alb: + error_threshold: 10 + error_period: 60 + response_time_threshold: 1 + response_time_period: 60 + evaluation_periods: 2 + target_group: + unhealthy_threshold: 1 + period: 60 + evaluation_periods: 2 + ecs: + cpu_threshold: 80 + memory_threshold: 80 + period: 300 + evaluation_periods: 2 + mq: + message_ready_threshold: 1000 + consumer_alarm_enabled: false # worker desired_count is intentionally 0 in current stage posture + consumer_count_threshold: 1 + cpu_threshold: 80 + memory_bytes_threshold: 512000000 # ~512 MB (mq.t3.micro has 1 GB) + period: 300 + evaluation_periods: 2 + queue_name: olympia + virtual_host: / + redis: + memory_pct_threshold: 80 + eviction_threshold: 100 + cpu_threshold: 80 + host_cpu_threshold: 90 # cache.t3.small has 2 vCPUs; AWS recommends monitoring host CPUUtilization alongside EngineCPUUtilization on nodes with <= 2 vCPUs + connection_threshold: 500 + period: 300 + evaluation_periods: 2 + dashboard: + period: 300 + # ============================================================================= # Notes for implementation: # =============================================================================
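The widget builders at the top of the `__main__.py` hunk are deliberately free of Pulumi dependencies, so they can be exercised without a Pulumi runtime. A minimal sketch of such a test follows; it assumes the builders are importable (e.g. moved into a hypothetical `dashboard_widgets.py` helper module, since importing a Pulumi program's `__main__.py` directly is awkward), and the ARN suffixes and names below are placeholder values:

# test_dashboard_widgets.py -- illustrative sketch; the import path and
# sample values are placeholders, not part of this change
import json

from dashboard_widgets import _build_dashboard_body  # hypothetical module


def test_build_dashboard_body_minimal():
    # Resolved-output dict shaped like the pulumi.Output.all() result,
    # using fake ARN suffixes and names
    o = {
        "web_alb_suffix": "app/web-alb/0123456789abcdef",
        "web_tg_suffix": "targetgroup/web-tg/0123456789abcdef",
        "web_cluster": "stage-cluster",
        "web_svc_name": "web",
    }
    body = json.loads(_build_dashboard_body(o, "us-east-1", 300, "olympia", "/"))
    titles = [w["properties"]["title"] for w in body["widgets"]]
    # Availability row plus the web-only ALB and ECS rows
    assert "ALB - Healthy Hosts" in titles
    assert "ECS - Running Tasks" in titles
    assert "Web ALB - Requests and Errors" in titles
    # Every widget fits the 24-column dashboard grid
    assert all(w["x"] + w["width"] <= 24 for w in body["widgets"])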