3131 alertsStore map [uint64 ]map [uint64 ][][]interface {} // [alert_id][server_id] -> 对应事件规则的检查结果
3232 alertsPrevState map [uint64 ]map [uint64 ]uint // [alert_id][server_id] -> 对应事件规则的上一次事件状态
3333 AlertsCycleTransferStatsStore map [uint64 ]* model.CycleTransferStats // [alert_id] -> 对应事件规则的周期流量统计
34+ serverLastOnlineTime map [uint64 ]time.Time // [server_id] -> 服务器离线前的最后在线时间(用于恢复通知计算离线时长)
3435)
3536
3637// addCycleTransferStatsInfo 向AlertsCycleTransferStatsStore中添加周期流量事件统计信息
@@ -72,6 +73,7 @@ func AlertSentinelStart() {
7273 alertsStore = make (map [uint64 ]map [uint64 ][][]interface {})
7374 alertsPrevState = make (map [uint64 ]map [uint64 ]uint )
7475 AlertsCycleTransferStatsStore = make (map [uint64 ]* model.CycleTransferStats )
76+ serverLastOnlineTime = make (map [uint64 ]time.Time )
7577 AlertsLock .Lock ()
7678 defer func () {
7779 if r := recover (); r != nil {
@@ -496,6 +498,20 @@ func checkStatus() {
496498 alertsPrevStateCopy [alert.ID ][server.ID ] = _RuleCheckFail
497499 log .Printf ("[事件]\n %s\n 规则:%s %s" , server .Name , alert .Name , * NotificationMuteLabel .ServerIncident (alert .ID , server .ID ))
498500
501+ // 记录最后在线时间(用于恢复通知计算真实离线时长)
502+ if isOfflineAlert {
503+ if _ , exists := serverLastOnlineTime [server .ID ]; ! exists {
504+ // 保存服务器离线前的最后在线时间
505+ if ! server .LastOnline .IsZero () {
506+ serverLastOnlineTime [server .ID ] = server .LastOnline
507+ } else if ! server .LastActive .IsZero () {
508+ serverLastOnlineTime [server .ID ] = server .LastActive
509+ } else {
510+ serverLastOnlineTime [server .ID ] = time .Now ()
511+ }
512+ }
513+ }
514+
499515 // 生成详细的通知消息
500516 message := generateDetailedAlertMessage (alert , server , alertsStoreCopy [alert.ID ][server.ID ])
501517
@@ -693,11 +709,8 @@ func formatBytes(bytes uint64) string {
693709func generateDetailedAlertMessage (alert * model.AlertRule , server * model.Server , checkResultsHistory [][]interface {}) string {
694710 now := time .Now ()
695711
696- // 基础通知信息
697- message := fmt .Sprintf ("#%s" + "\n " + "[%s]" + "\n " + "%s[%s]" + "\n " + "服务器ID: %d" + "\n " + "通知时间: %s" + "\n " ,
698- Localizer .MustLocalize (& i18n.LocalizeConfig {
699- MessageID : "Notify" ,
700- }),
712+ // 基础通知信息(移除了#探针通知前缀)
713+ message := fmt .Sprintf ("[%s]" + "\n " + "%s[%s]" + "\n " + "服务器ID: %d" + "\n " + "通知时间: %s" + "\n " ,
701714 Localizer .MustLocalize (& i18n.LocalizeConfig {
702715 MessageID : "Incident" ,
703716 }),
@@ -793,7 +806,7 @@ func generateDetailedAlertMessage(alert *model.AlertRule, server *model.Server,
793806 offlineDuration = time .Hour
794807 }
795808
796- message += fmt .Sprintf ("• 服务器离线: 最后在线时间 %s (离线时长 : %s) \n " ,
809+ message += fmt .Sprintf ("• 服务器离线: 最后在线时间 %s\n • 已离线时长 : %s\n " ,
797810 lastSeenTime .Format ("2006-01-02 15:04:05" ),
798811 formatDuration (offlineDuration ))
799812 default :
@@ -1058,11 +1071,8 @@ func cleanupAlertMemoryDataAsync() {
10581071func generateDetailedRecoveryMessage (alert * model.AlertRule , server * model.Server ) string {
10591072 now := time .Now ()
10601073
1061- // 基础恢复信息
1062- message := fmt .Sprintf ("#%s" + "\n " + "[%s]" + "\n " + "%s[%s]" + "\n " + "服务器ID: %d" + "\n " + "恢复时间: %s" + "\n " ,
1063- Localizer .MustLocalize (& i18n.LocalizeConfig {
1064- MessageID : "Notify" ,
1065- }),
1074+ // 基础恢复信息(移除了#探针通知前缀)
1075+ message := fmt .Sprintf ("[%s]" + "\n " + "%s[%s]" + "\n " + "服务器ID: %d" + "\n " + "恢复时间: %s" + "\n " ,
10661076 Localizer .MustLocalize (& i18n.LocalizeConfig {
10671077 MessageID : "Resolved" ,
10681078 }),
@@ -1083,30 +1093,30 @@ func generateDetailedRecoveryMessage(alert *model.AlertRule, server *model.Serve
10831093 }
10841094
10851095 if hasOfflineRule {
1086- // 修复恢复消息中的离线时长计算
1087- var lastSeenTime time.Time
1096+ // 使用记录的最后在线时间来计算真实的离线时长
1097+ var lastOnlineTime time.Time
10881098 var offlineDuration time.Duration
10891099
1090- // 优先使用LastOnline字段(这是服务器最后一次在线的准确时间)
1091- if ! server .LastOnline .IsZero () {
1092- lastSeenTime = server .LastOnline
1093- offlineDuration = now .Sub (lastSeenTime )
1094- } else if ! server .LastActive .IsZero () {
1095- // 如果没有LastOnline,使用LastActive,但需要考虑离线超时时间
1096- lastSeenTime = server .LastActive
1097- // 减去离线检测的超时时间(3分钟),得到更准确的离线时长
1098- offlineDuration = now .Sub (lastSeenTime ) - (3 * time .Minute )
1099- if offlineDuration < 0 {
1100- offlineDuration = now .Sub (lastSeenTime )
1101- }
1100+ // 优先使用记录的最后在线时间(与事件通知中显示的一致)
1101+ if recordedTime , exists := serverLastOnlineTime [server .ID ]; exists {
1102+ lastOnlineTime = recordedTime
1103+ offlineDuration = now .Sub (lastOnlineTime )
1104+ // 清除记录的最后在线时间
1105+ delete (serverLastOnlineTime , server .ID )
11021106 } else {
1103- // 如果都没有,说明服务器从未上线过
1104- lastSeenTime = now .Add (- time .Hour ) // 默认1小时前
1105- offlineDuration = time .Hour
1107+ // 如果没有记录,使用当前服务器的状态作为备选
1108+ if ! server .LastOnline .IsZero () {
1109+ lastOnlineTime = server .LastOnline
1110+ } else if ! server .LastActive .IsZero () {
1111+ lastOnlineTime = server .LastActive
1112+ } else {
1113+ lastOnlineTime = now .Add (- time .Hour )
1114+ }
1115+ offlineDuration = now .Sub (lastOnlineTime )
11061116 }
11071117
1108- message += fmt .Sprintf ("• 服务器已恢复上线: 上次离线时间 %s ( 离线时长: %s) \n " ,
1109- lastSeenTime .Format ("2006-01-02 15:04:05" ),
1118+ message += fmt .Sprintf ("• 服务器已恢复上线: 上次离线时间 %s\n • 离线时长: %s\n " ,
1119+ lastOnlineTime .Format ("2006-01-02 15:04:05" ),
11101120 formatDuration (offlineDuration ))
11111121 } else {
11121122 message += "• 服务器监控指标已恢复正常\n "
0 commit comments