From d4b8b1588b7bc787040ef635300860a11588a39f Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 12 Apr 2026 14:31:07 +0800 Subject: [PATCH] =?UTF-8?q?feat(flywheel):=20Phase=202-2/2-4=20=E2=80=94?= =?UTF-8?q?=20classify=5Falert=5Fearly=20+=20alertname/notification=5Ftype?= =?UTF-8?q?/alert=5Fcategory=20=E5=AF=AB=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type - webhooks.py: 新增 classify_alert_early() — 6 條規則覆蓋 config_drift/info/backup/infra/k8s/db/general - webhooks.py: alertmanager_webhook 呼叫 classify_alert_early() 並傳入兩個 create_incident_for_approval() 呼叫點 - incident_service.py: create_incident_for_approval() 新增 notification_type/alert_category 參數,寫入 Incident model - incident_repository.py: _incident_to_record_data() 新增 alertname/notification_type/alert_category 序列化 - db/models.py: IncidentRecord ORM 新增 alertname/notification_type/alert_category 三個 mapped_column 防止 HostBackupFailed 等告警被誤路由到 K8s executor (ADR-073 Phase 2-4 同步完成) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/webhooks.py | 36 +++++++++++++++++++ apps/api/src/db/models.py | 17 +++++++++ .../src/repositories/incident_repository.py | 3 ++ apps/api/src/services/incident_service.py | 4 +++ 4 files changed, 60 insertions(+) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 556492ce..7b031d8c 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -951,6 +951,30 @@ class AlertmanagerPayload(BaseModel): alerts: list[AlertmanagerAlert] +def classify_alert_early(alertname: str, severity: str, _labels: dict) -> tuple[str, str]: + """ + ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。 + 2026-04-12 ogt: 防止 HostBackupFailed 等被誤路由到 K8s executor。 + + Returns: + tuple[str, str]: (alert_category, notification_type) + """ + alertname_lower = alertname.lower() + if alertname in ("ConfigurationDrift", "KubeConfigDrift"): + return "config_drift", "TYPE-4D" + if severity in ("info", "none"): + return "info", "TYPE-1" + if any(kw in alertname_lower for kw in ("backup", "heartbeat")): + return "backup", "TYPE-1" + if alertname.startswith(("Docker", "Host")): + return "infrastructure", "TYPE-3" + if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")): + return "kubernetes", "TYPE-3" + if alertname.startswith(("Postgres", "Redis")): + return "database", "TYPE-3" + return "general", "TYPE-3" + + def is_internal_ip(client_ip: str) -> bool: """檢查是否為內網 IP""" import ipaddress @@ -1103,6 +1127,14 @@ async def alertmanager_webhook( # I1 整合 ADR-064 Rule Engine 2026-04-11: YAML 規則動態推斷,ALERTNAME_TO_TYPE 為 fallback alert_type = get_incident_type(alertname) + # ADR-073 Phase 2-2: 早期分診 — 在 LLM 前決定 alert_category + notification_type + # 2026-04-12 ogt: 防止 HostBackupFailed 等被誤路由到 K8s executor + alert_category, notification_type = classify_alert_early( + alertname=alertname, + severity=alert.labels.get("severity", "warning"), + labels=alert.labels, + ) + severity_map = {"critical": "critical", "warning": "warning", "info": "info"} severity = severity_map.get( alert.labels.get("severity", "warning").lower(), @@ -1269,6 +1301,8 @@ async def alertmanager_webhook( source="alertmanager", alertname=alertname, alert_labels=alert.labels, # Phase 1: 完整 labels 供 _extract_affected_services + notification_type=notification_type, # ADR-073 Phase 2-2 + alert_category=alert_category, # ADR-073 Phase 2-2 ) # 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval @@ -1403,6 +1437,8 @@ async def alertmanager_webhook( source="alertmanager", alertname=alertname, alert_labels=alert.labels, # Phase 1: 完整 labels + notification_type=notification_type, # ADR-073 Phase 2-2 + alert_category=alert_category, # ADR-073 Phase 2-2 ) background_tasks.add_task( diff --git a/apps/api/src/db/models.py b/apps/api/src/db/models.py index 6faabe5d..12f8a87e 100644 --- a/apps/api/src/db/models.py +++ b/apps/api/src/db/models.py @@ -565,6 +565,23 @@ class IncidentRecord(Base): comment="事件結果與人類回饋", ) + # === ADR-073 Phase 2 欄位 (2026-04-12 ogt) === + alertname: Mapped[str | None] = mapped_column( + String(100), + nullable=True, + comment="告警名稱 (從 signals labels 抽取)", + ) + notification_type: Mapped[str | None] = mapped_column( + String(10), + nullable=True, + comment="通知類型 TYPE-1/2/3/4/4D (早期分診)", + ) + alert_category: Mapped[str | None] = mapped_column( + String(50), + nullable=True, + comment="告警類別 config_drift/info/backup/infrastructure/kubernetes/database/general", + ) + # === 頻率快照 (Phase 27, 2026-04-10 ogt) === # frequency_stats 原本只存記憶體/Redis(TTL=35天),Pod重啟或超期即失 # 此欄位在 incident 建立時寫入快照,永久保存當時的頻率統計 diff --git a/apps/api/src/repositories/incident_repository.py b/apps/api/src/repositories/incident_repository.py index 6ff69bb5..3e3558cd 100644 --- a/apps/api/src/repositories/incident_repository.py +++ b/apps/api/src/repositories/incident_repository.py @@ -79,6 +79,9 @@ def _incident_to_record_data(incident: Incident) -> dict[str, Any]: "resolved_at": incident.resolved_at, "closed_at": incident.closed_at, "frequency_snapshot": frequency_snapshot, + "alertname": incident.signals[0].alert_name if incident.signals else None, # ADR-073 Phase 2-1 + "notification_type": incident.notification_type, # ADR-073 Phase 2-2 + "alert_category": incident.alert_category, # ADR-073 Phase 2-2 } diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index 2bd01ed0..db2461ab 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -111,6 +111,8 @@ async def create_incident_for_approval( source: str = "alertmanager", alertname: str | None = None, alert_labels: dict | None = None, + notification_type: str | None = None, # ADR-073 Phase 2-2 + alert_category: str | None = None, # ADR-073 Phase 2-2 ) -> str: """ 為 Approval 創建對應的 Incident (活躍事件同步)。 @@ -151,6 +153,8 @@ async def create_incident_for_approval( signals=[signal], affected_services=_affected_services, proposal_ids=[UUID(approval_id)], + notification_type=notification_type, # ADR-073 Phase 2-2 + alert_category=alert_category, # ADR-073 Phase 2-2 ) await incident_service.save_to_working_memory(incident)