Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions mu-qa.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ policy-arns = [

[deployed-env]
CRITIC_NAMESPACE = 'qa'
MAILGUN_API_KEY = 'op://critic/mailgun/password'
MAILGUN_DOMAIN = 'mg.level12.io'
MAILGUN_FROM = 'critic@level12.io'
SLACK_BOT_TOKEN = 'op://critic/slack_bot_token/password'

[event-rules.run-due-checks]
action='run_due_checks'
Expand Down
78 changes: 78 additions & 0 deletions src/critic/alerts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import logging

from critic.libs.mailgun import send_email
from critic.libs.slack import post_message, post_webhook
from critic.models import MonitorState, UptimeMonitorModel


log = logging.getLogger(__name__)


def _monitor_url(monitor: UptimeMonitorModel) -> str:
return str(monitor.url)


def _monitor_label(monitor: UptimeMonitorModel) -> str:
return f'{monitor.project_id}/{monitor.slug}'


def _send_slack(monitor: UptimeMonitorModel, text: str) -> None:
for dest in monitor.alert_slack_channels:
try:
# If it looks like a webhook, use webhook mode otherwise treat as channel id/name.
if dest.startswith(('http://', 'https://')):
post_webhook(dest, text)
else:
post_message(dest, text)
except Exception as e:
log.exception(f'Failed to send Slack alert to {dest}: {e}')


def _send_email(monitor: UptimeMonitorModel, subject: str, text: str) -> None:
for email in monitor.alert_emails:
try:
send_email(email, subject, text)
except Exception as e:
log.exception(f'Failed to send email alert to {email}: {e}')


def maybe_send_alerts(
    *,
    monitor: UptimeMonitorModel,
    prev_state: MonitorState,
    prev_consecutive_fails: int,
) -> None:
    """Send a recovery or down alert for ``monitor`` when its latest check warrants one.

    ``monitor`` must already carry the ``state`` and ``consecutive_fails`` produced by
    the check that just ran; ``prev_state`` and ``prev_consecutive_fails`` are the
    values it held before that check.

    Rules:
      * Paused monitors never alert.
      * Recovery: sent on a down -> up transition, but only if the failure streak that
        just ended had reached ``failures_before_alerting``. Shorter streaks never
        produced a down alert, so announcing their recovery would only be noise.
      * Down: sent once per outage, on the check where the streak first reaches the
        threshold, or where the monitor first becomes down with the threshold
        already met.

    Alerts are delivered to the monitor's Slack destinations and email recipients.
    """
    if monitor.state == MonitorState.paused:
        return

    label = _monitor_label(monitor)
    url = _monitor_url(monitor)
    threshold = monitor.failures_before_alerting

    # Recovery
    if prev_state == MonitorState.down and monitor.state == MonitorState.up:
        if prev_consecutive_fails < threshold:
            # The outage never crossed the alert threshold, so nobody was told the
            # monitor was down; stay silent about it coming back.
            log.info(f'Skipping recovery alert for {label}: no down alert was sent')
            return
        subject = f'CRITIC RECOVERY: {label}'
        text = f'Recovered: {label}\nURL: {url}'
        log.info(f'Sending recovery alert for {label}')
        _send_slack(monitor, text)
        _send_email(monitor, subject, text)
        return

    # Down alert
    if monitor.state != MonitorState.down or monitor.consecutive_fails < threshold:
        return

    # Alert only on the transition, not on every failing check after it.
    crossed_threshold = prev_consecutive_fails < threshold
    became_down = prev_state != MonitorState.down
    if not (crossed_threshold or became_down):
        return

    subject = f'CRITIC DOWN: {label}'
    text = (
        f'Down: {label}\n'
        f'URL: {url}\n'
        f'Consecutive fails: {monitor.consecutive_fails} '
        f'(threshold: {threshold})'
    )
    log.info(f'Sending down alert for {label}')
    _send_slack(monitor, text)
    _send_email(monitor, subject, text)
8 changes: 7 additions & 1 deletion src/critic/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

import click

from critic.models import UptimeMonitorModel
from critic.libs.assertions import Assertion
from critic.models import MonitorState, UptimeMonitorModel
from critic.tables import ProjectTable, UptimeMonitorTable


Expand Down Expand Up @@ -41,6 +42,11 @@ def put_fake_monitors(project_id: UUID, count: int):
project_id=project_id,
slug=str(i),
url='https://google.com',
failures_before_alerting=1,
alert_slack_channels=['C09D3TDEB9B'],
alert_emails=['critic@level12.io'],
assertions=[Assertion(assertion_string='status_code == 301')],
state=MonitorState.down,
)
UptimeMonitorTable.put(monitor)
click.echo(f' Put monitor: {project_id}/{i}')
Expand Down
36 changes: 36 additions & 0 deletions src/critic/libs/mailgun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os

import httpx


class MailgunError(Exception):
    """Raised when a Mailgun email cannot be sent because required input is missing."""


def send_email(to_email: str, subject: str, text: str) -> None:
    """Send a plain-text email through the Mailgun HTTP API.

    Configuration is read from the ``MAILGUN_API_KEY``, ``MAILGUN_DOMAIN`` and
    ``MAILGUN_FROM`` environment variables.

    Raises:
        MailgunError: if any of those variables, or ``to_email``, is missing or empty.

    The response is checked with ``raise_for_status()``, so an error status from
    Mailgun surfaces as the exception httpx raises for it.
    """
    env = os.environ
    if not (api_key := env.get('MAILGUN_API_KEY')):
        raise MailgunError('Missing MAILGUN_API_KEY')
    if not (domain := env.get('MAILGUN_DOMAIN')):
        raise MailgunError('Missing MAILGUN_DOMAIN')
    if not (mail_from := env.get('MAILGUN_FROM')):
        raise MailgunError('Missing MAILGUN_FROM')
    if not to_email:
        raise MailgunError('Missing recipient email')

    endpoint = f'https://api.mailgun.net/v3/{domain}/messages'
    form = {
        'from': mail_from,
        'to': to_email,
        'subject': subject,
        'text': text,
    }
    response = httpx.post(endpoint, auth=('api', api_key), data=form, timeout=10)
    response.raise_for_status()
38 changes: 38 additions & 0 deletions src/critic/libs/slack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os

import httpx


class SlackError(Exception):
    """Raised when a Slack message cannot be sent or the Slack API reports an error."""


def post_webhook(webhook_url: str, text: str) -> None:
    """Send ``text`` to Slack through an Incoming Webhook URL.

    Raises:
        SlackError: if ``webhook_url`` is empty.

    The response is checked with ``raise_for_status()``.
    """
    if not webhook_url:
        raise SlackError('Missing webhook_url')

    payload = {'text': text}
    response = httpx.post(webhook_url, json=payload, timeout=10)
    response.raise_for_status()


def post_message(channel: str, text: str) -> None:
    """Send ``text`` to ``channel`` with Slack's ``chat.postMessage`` endpoint.

    The bot token is read from the ``SLACK_BOT_TOKEN`` environment variable.

    Raises:
        SlackError: if the token is missing, or if the JSON response does not
            carry a truthy ``ok`` field.

    The HTTP status is also checked with ``raise_for_status()`` before the body
    is inspected.
    """
    token = os.environ.get('SLACK_BOT_TOKEN')
    if not token:
        raise SlackError('Missing SLACK_BOT_TOKEN')

    auth_headers = {'Authorization': f'Bearer {token}'}
    payload = {'channel': channel, 'text': text}
    response = httpx.post(
        'https://slack.com/api/chat.postMessage',
        headers=auth_headers,
        json=payload,
        timeout=10,
    )
    response.raise_for_status()

    # A successful HTTP status is not enough: the outcome is in the body's "ok" flag.
    data = response.json()
    if data.get('ok'):
        return
    raise SlackError(f'Slack API error: {data.get("error")}')
33 changes: 26 additions & 7 deletions src/critic/libs/uptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import httpx

from critic.alerts import maybe_send_alerts
from critic.libs.dt import round_minute
from critic.models import MonitorState, UptimeLogModel, UptimeMonitorModel
from critic.tables import UptimeLogTable, UptimeMonitorTable
Expand Down Expand Up @@ -90,8 +91,12 @@ def make_req(self) -> tuple[httpx.Response | None, float]:
latency = response.elapsed.total_seconds() * 1000 if response else (finished - start)
return response, latency

def alert(self):
"""TODO: alert self.monitor.alert_slack_channels and self.monitor.alert_emails."""
def alert(self, prev_state: MonitorState, prev_consecutive_fails: int):
    """Delegate alerting for this check to ``maybe_send_alerts``.

    Passes along ``self.monitor`` together with the state and consecutive-fail
    count the monitor had before the check.
    """
    monitor = self.monitor
    maybe_send_alerts(
        monitor=monitor,
        prev_state=prev_state,
        prev_consecutive_fails=prev_consecutive_fails,
    )

def check_resp(self, response: httpx.Response | None) -> tuple[MonitorState, int, list[str]]:
"""Checks the response and returns the new state and consecutive fails and list of error
Expand All @@ -115,12 +120,15 @@ def check_resp(self, response: httpx.Response | None) -> tuple[MonitorState, int
error_messages.append('Connection Timeout')

consecutive_fails = 0 if state == MonitorState.up else self.monitor.consecutive_fails + 1
if consecutive_fails >= self.monitor.failures_before_alerting:
self.alert()

return state, consecutive_fails, error_messages

def put_log(
self, state: MonitorState, status_code: int, latency: float, error_message: str | None
self,
state: MonitorState,
status_code: int,
latency: float,
error_messages: list[str],
):
"""
Puts a log for the check. This method should only be called once per monitor check.
Expand All @@ -133,7 +141,7 @@ def put_log(
status=state,
resp_code=status_code,
latency_secs=latency,
error_message=error_message,
error_message=error_messages if error_messages else None,
)
UptimeLogTable.put(uptime_log)
self._put_log = True
Expand All @@ -152,6 +160,10 @@ def run(self):
self.update_monitor()
return

# Capture previous values for alert logic
prev_state = self.monitor.state
prev_consecutive_fails = self.monitor.consecutive_fails

# Make the request
resp, latency = self.make_req()

Expand All @@ -163,9 +175,16 @@ def run(self):

# Save a log
if updated:
# Keep the in-memory monitor consistent for alert formatting/logic
self.monitor.state = state
self.monitor.consecutive_fails = consecutive_fails

# Alerts (only if update succeeded to prevent duplicates on race conditions)
self.alert(prev_state, prev_consecutive_fails)

self.put_log(
state,
resp.status_code if resp else 0,
latency,
error_messages if error_messages else None,
error_messages,
)
Loading