From 1a4b52ed285323630b4f74e226b86da1db2fbce3 Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 12 Apr 2026 22:50:20 +0800 Subject: [PATCH] =?UTF-8?q?fix(alert):=20fingerprint=20=E5=8A=A0=20alertna?= =?UTF-8?q?me=20=E9=98=B2=E8=B7=A8=E5=91=8A=E8=AD=A6=E6=8C=87=E7=B4=8B?= =?UTF-8?q?=E8=A1=9D=E7=AA=81=20+=20=E8=A3=9C=E5=85=A5=E7=BC=BA=E6=BC=8F?= =?UTF-8?q?=E5=BF=83=E8=B7=B3=E5=88=86=E9=A1=9E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題根因: 1. generate_fingerprint 用 alert_type(大量 alertname 落入 "custom") → 不同告警名稱同目標共用指紋 → 30 分鐘 debounce 互相擋截 2. classify_alert_early 漏掉 DeadMansSwitch / NoAlertsReceived / PrometheusNotConnectedToAlertmanager → 落入 TYPE-3 一般告警 修復: - alert_analyzer_service.py: 指紋改為 namespace:deployment:alertname:target_resource alertname 取自 labels(Alertmanager),fallback 到 alert_type(其他來源) - incident_service.py: DeadMansSwitch → backup/TYPE-1; NoAlertsReceived + PrometheusNotConnectedToAlertmanager → alertchain_health/TYPE-8M - 補 2 個測試,全套 627 passed Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/alert_analyzer_service.py | 17 +++++++++++++---- apps/api/src/services/incident_service.py | 5 +++++ apps/api/tests/test_classify_alert_early.py | 8 ++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/apps/api/src/services/alert_analyzer_service.py b/apps/api/src/services/alert_analyzer_service.py index db319fea..8ab2f324 100644 --- a/apps/api/src/services/alert_analyzer_service.py +++ b/apps/api/src/services/alert_analyzer_service.py @@ -182,10 +182,14 @@ class AlertAnalyzer: """ 生成告警唯一指紋 (SHA256 Hash) - 指紋組成: namespace:deployment:alert_type:target_resource + 指紋組成: namespace:deployment:alertname:target_resource - 同一個告警模式(相同位置、相同類型)會產生相同指紋, - 用於識別重複告警並進行聚合。 + 使用 alertname(而非 alert_type)確保不同告警名稱不共用指紋。 + 原本用 alert_type 導致許多不同 alertname 都落入 "custom", + 造成同目標上的不同告警互相擋截(ADR-073 修復 2026-04-12 ogt)。 + + 同一個告警名稱 + 同一目標 → 相同指紋 → 觸發 debounce 去重。 + 不同告警名稱(即使同目標)→ 不同指紋 → 各自建立 Incident。 """ # 從 labels 取得 deployment,如果沒有則用 target_resource deployment = "" @@ -194,8 +198,13 @@ class AlertAnalyzer: if not deployment: deployment = alert.target_resource + # alertname 優先取 labels,fallback 到 alert_type(非 Alertmanager 來源) + alertname = ( + alert.labels.get("alertname", "") if alert.labels else "" + ) or alert.alert_type + # 組合指紋來源 - fingerprint_source = f"{alert.namespace}:{deployment}:{alert.alert_type}:{alert.target_resource}" + fingerprint_source = f"{alert.namespace}:{deployment}:{alertname}:{alert.target_resource}" # SHA256 Hash return hashlib.sha256(fingerprint_source.encode()).hexdigest()[:32] diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index 03b12962..e7aa1e58 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -130,11 +130,14 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No return "config_drift", "TYPE-4D" # 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷) + # 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager if alertname in ( "AlertChainBroken_Alertmanager", "AlertChainBroken_Sentry", + "NoAlertsReceived", "NoAlertsReceived2Hours", "AlertChainUnhealthy", + "PrometheusNotConnectedToAlertmanager", ): return "alertchain_health", "TYPE-8M" @@ -171,8 +174,10 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No "HostBackupFailed", "HostBackupStale", "HostBackupMissing", "BackupRestoreTestFailed", "BackupRestoreTestStale", } + # 2026-04-12 ogt: 補入 DeadMansSwitch(HEARTBEAT_ALERT_NAMES 中但之前漏掉) if ( "watchdog" in alertname_lower + or "deadmansswitch" in alertname_lower or alertname == "Heartbeat" or alertname in _BACKUP_TYPE1_NAMES or alertname.startswith("HostBackup") diff --git a/apps/api/tests/test_classify_alert_early.py b/apps/api/tests/test_classify_alert_early.py index d1ede5b5..e1008ebe 100644 --- a/apps/api/tests/test_classify_alert_early.py +++ b/apps/api/tests/test_classify_alert_early.py @@ -66,6 +66,12 @@ class TestInfoAlerts: ac, nt = classify_alert_early("Watchdog", "none", {}) assert nt == "TYPE-1" + def test_deadmansswitch_heartbeat(self): + # DeadMansSwitch 心跳 → TYPE-1(補入 2026-04-12 ogt) + ac, nt = classify_alert_early("DeadMansSwitch", "warning", {}) + assert ac == "backup" + assert nt == "TYPE-1" + def test_backup_critical_not_type1(self): # critical backup 告警應走各自 prefix,不是純資訊 ac, nt = classify_alert_early("BACKUP_MISSING", "critical", {}) @@ -115,6 +121,8 @@ class TestAlertchainHealth: "AlertChainBroken_Sentry", "NoAlertsReceived2Hours", "AlertChainUnhealthy", + "NoAlertsReceived", + "PrometheusNotConnectedToAlertmanager", ]) def test_alertchain_alerts(self, alertname): ac, nt = classify_alert_early(alertname, "critical", {})