fix(classify): backup/heartbeat severity=warning/critical 告警恢復告警卡片格式
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 2m38s

根因:classify_alert_early() backup 規則無 severity 條件,導致
VeleroBackupFailed / HostBackupFailed (warning/critical) 被分為 TYPE-1
(純資訊無按鈕),告警卡片格式遺失。

修復:
- backup/heartbeat 關鍵字只在 severity=info/none 才命中 TYPE-1
- severity=warning/critical 的 backup 告警走正確 prefix 規則
  (Velero→kubernetes TYPE-3, HostBackup→infrastructure TYPE-3)
- Watchdog (severity=none) 由 severity 規則先命中,維持 TYPE-1
- 補強測試:25 cases,含 VeleroBackupFailed critical → kubernetes TYPE-3

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 15:24:00 +08:00
parent e770813b6b
commit 1074936e54
2 changed files with 28 additions and 13 deletions

View File

@@ -126,7 +126,12 @@ def classify_alert_early(alertname: str, severity: str, _labels: dict) -> tuple[
return "config_drift", "TYPE-4D"
if severity in ("info", "none"):
return "info", "TYPE-1"
if any(kw in alertname_lower for kw in ("backup", "heartbeat")):
# backup/heartbeat 關鍵字只有 severity=info/none 才是純資訊
# severity=warning/critical(例如 VeleroBackupFailed, HostBackupFailed)→ 繼續走 prefix 規則
if severity in ("info", "none") and any(kw in alertname_lower for kw in ("backup", "heartbeat")):
return "backup", "TYPE-1"
# Watchdog/Heartbeat 永遠是 TYPE-1 (Alertmanager 心跳)
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
return "backup", "TYPE-1"
if alertname.startswith(("Docker", "Host")):
return "infrastructure", "TYPE-3"

View File

@@ -47,20 +47,25 @@ class TestInfoAlerts:
assert nt == "TYPE-1"
assert ac == "info"
def test_backup_keyword(self):
def test_backup_keyword_info_only(self):
# severity=info → severity 規則先命中,TYPE-1
ac, nt = classify_alert_early("BackupJobComplete", "info", {})
assert nt == "TYPE-1"
def test_backup_keyword_warning_not_type1(self):
# BackupJobFailed severity=warning → 繼續走 prefix 規則,不應是 TYPE-1
ac, nt = classify_alert_early("BackupJobFailed", "warning", {})
assert nt == "TYPE-1"
assert ac == "backup"
assert nt == "TYPE-3"
def test_heartbeat_keyword(self):
ac, nt = classify_alert_early("WatchdogHeartbeat", "warning", {})
def test_watchdog_heartbeat(self):
# Watchdog (Alertmanager 心跳) severity=none → severity 規則先命中,TYPE-1
ac, nt = classify_alert_early("Watchdog", "none", {})
assert nt == "TYPE-1"
assert ac == "backup"
def test_backup_case_insensitive(self):
def test_backup_critical_not_type1(self):
# critical backup 告警應走各自 prefix,不是純資訊
ac, nt = classify_alert_early("BACKUP_MISSING", "critical", {})
assert nt == "TYPE-1"
assert ac == "backup"
assert nt == "TYPE-3"
# --------------------------------------------------------------------------- #
@@ -100,11 +105,16 @@ class TestKubernetes:
assert nt == "TYPE-3"
assert ac == "kubernetes"
def test_velero_backup_keyword_wins(self):
# VeleroBackupFailed 含 "backup" → backup 規則優先於 kubernetes prefix
def test_velero_backup_failed_is_kubernetes(self):
# VeleroBackupFailed severity=critical → backup 規則不命中,走 Velero prefix → kubernetes TYPE-3
ac, nt = classify_alert_early("VeleroBackupFailed", "critical", {})
assert nt == "TYPE-3"
assert ac == "kubernetes"
def test_velero_backup_success_info_is_type1(self):
# VeleroBackupSuccess severity=info → TYPE-1
ac, nt = classify_alert_early("VeleroBackupSuccess", "info", {})
assert nt == "TYPE-1"
assert ac == "backup"
# --------------------------------------------------------------------------- #