diff --git a/apps/api/src/services/alert_analyzer_service.py b/apps/api/src/services/alert_analyzer_service.py index db319fea..8ab2f324 100644 --- a/apps/api/src/services/alert_analyzer_service.py +++ b/apps/api/src/services/alert_analyzer_service.py @@ -182,10 +182,14 @@ class AlertAnalyzer: """ 生成告警唯一指紋 (SHA256 Hash) - 指紋組成: namespace:deployment:alert_type:target_resource + 指紋組成: namespace:deployment:alertname:target_resource - 同一個告警模式(相同位置、相同類型)會產生相同指紋, - 用於識別重複告警並進行聚合。 + 使用 alertname(而非 alert_type)確保不同告警名稱不共用指紋。 + 原本用 alert_type 導致許多不同 alertname 都落入 "custom", + 造成同目標上的不同告警互相擋截(ADR-073 修復 2026-04-12 ogt)。 + + 同一個告警名稱 + 同一目標 → 相同指紋 → 觸發 debounce 去重。 + 不同告警名稱(即使同目標)→ 不同指紋 → 各自建立 Incident。 """ # 從 labels 取得 deployment,如果沒有則用 target_resource deployment = "" @@ -194,8 +198,13 @@ class AlertAnalyzer: if not deployment: deployment = alert.target_resource + # alertname 優先取 labels,fallback 到 alert_type(非 Alertmanager 來源) + alertname = ( + alert.labels.get("alertname", "") if alert.labels else "" + ) or alert.alert_type + # 組合指紋來源 - fingerprint_source = f"{alert.namespace}:{deployment}:{alert.alert_type}:{alert.target_resource}" + fingerprint_source = f"{alert.namespace}:{deployment}:{alertname}:{alert.target_resource}" # SHA256 Hash return hashlib.sha256(fingerprint_source.encode()).hexdigest()[:32] diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index 03b12962..e7aa1e58 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -130,11 +130,14 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No return "config_drift", "TYPE-4D" # 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷) + # 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager if alertname in ( "AlertChainBroken_Alertmanager", "AlertChainBroken_Sentry", + "NoAlertsReceived", "NoAlertsReceived2Hours", "AlertChainUnhealthy", + "PrometheusNotConnectedToAlertmanager", ): return "alertchain_health", "TYPE-8M" @@ -171,8 +174,10 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No "HostBackupFailed", "HostBackupStale", "HostBackupMissing", "BackupRestoreTestFailed", "BackupRestoreTestStale", } + # 2026-04-12 ogt: 補入 DeadMansSwitch(HEARTBEAT_ALERT_NAMES 中但之前漏掉) if ( "watchdog" in alertname_lower + or "deadmansswitch" in alertname_lower or alertname == "Heartbeat" or alertname in _BACKUP_TYPE1_NAMES or alertname.startswith("HostBackup") diff --git a/apps/api/tests/test_classify_alert_early.py b/apps/api/tests/test_classify_alert_early.py index d1ede5b5..e1008ebe 100644 --- a/apps/api/tests/test_classify_alert_early.py +++ b/apps/api/tests/test_classify_alert_early.py @@ -66,6 +66,12 @@ class TestInfoAlerts: ac, nt = classify_alert_early("Watchdog", "none", {}) assert nt == "TYPE-1" + def test_deadmansswitch_heartbeat(self): + # DeadMansSwitch 心跳 → TYPE-1(補入 2026-04-12 ogt) + ac, nt = classify_alert_early("DeadMansSwitch", "warning", {}) + assert ac == "backup" + assert nt == "TYPE-1" + def test_backup_critical_not_type1(self): # critical backup 告警應走各自 prefix,不是純資訊 ac, nt = classify_alert_early("BACKUP_MISSING", "critical", {}) @@ -115,6 +121,8 @@ class TestAlertchainHealth: "AlertChainBroken_Sentry", "NoAlertsReceived2Hours", "AlertChainUnhealthy", + "NoAlertsReceived", + "PrometheusNotConnectedToAlertmanager", ]) def test_alertchain_alerts(self, alertname): ac, nt = classify_alert_early(alertname, "critical", {})