feat(flywheel): Phase 2-2/2-4 — classify_alert_early + alertname/notification_type/alert_category 寫入

ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type
- webhooks.py: 新增 classify_alert_early() — 6 條規則加上預設值,覆蓋 config_drift/info/backup/infrastructure/kubernetes/database/general
- webhooks.py: alertmanager_webhook 呼叫 classify_alert_early() 並傳入兩個 create_incident_for_approval() 呼叫點
- incident_service.py: create_incident_for_approval() 新增 notification_type/alert_category 參數,寫入 Incident model
- incident_repository.py: _incident_to_record_data() 新增 alertname/notification_type/alert_category 序列化
- db/models.py: IncidentRecord ORM 新增 alertname/notification_type/alert_category 三個 mapped_column

防止 HostBackupFailed 等告警被誤路由到 K8s executor (ADR-073 Phase 2-4 同步完成)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 14:31:07 +08:00
parent 59b7d8ea32
commit d4b8b1588b
4 changed files with 60 additions and 0 deletions

View File

@@ -951,6 +951,30 @@ class AlertmanagerPayload(BaseModel):
alerts: list[AlertmanagerAlert]
def classify_alert_early(alertname: str, severity: str, _labels: dict) -> tuple[str, str]:
"""
ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。
2026-04-12 ogt: 防止 HostBackupFailed 等被誤路由到 K8s executor。
Returns:
tuple[str, str]: (alert_category, notification_type)
"""
alertname_lower = alertname.lower()
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
return "config_drift", "TYPE-4D"
if severity in ("info", "none"):
return "info", "TYPE-1"
if any(kw in alertname_lower for kw in ("backup", "heartbeat")):
return "backup", "TYPE-1"
if alertname.startswith(("Docker", "Host")):
return "infrastructure", "TYPE-3"
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
return "kubernetes", "TYPE-3"
if alertname.startswith(("Postgres", "Redis")):
return "database", "TYPE-3"
return "general", "TYPE-3"
def is_internal_ip(client_ip: str) -> bool:
"""檢查是否為內網 IP"""
import ipaddress
@@ -1103,6 +1127,14 @@ async def alertmanager_webhook(
# I1 整合 ADR-064 Rule Engine 2026-04-11: YAML 規則動態推斷,ALERTNAME_TO_TYPE 為 fallback
alert_type = get_incident_type(alertname)
# ADR-073 Phase 2-2: 早期分診 — 在 LLM 前決定 alert_category + notification_type
# 2026-04-12 ogt: 防止 HostBackupFailed 等被誤路由到 K8s executor
alert_category, notification_type = classify_alert_early(
alertname=alertname,
severity=alert.labels.get("severity", "warning"),
labels=alert.labels,
)
severity_map = {"critical": "critical", "warning": "warning", "info": "info"}
severity = severity_map.get(
alert.labels.get("severity", "warning").lower(),
@@ -1269,6 +1301,8 @@ async def alertmanager_webhook(
source="alertmanager",
alertname=alertname,
alert_labels=alert.labels, # Phase 1: 完整 labels 供 _extract_affected_services
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
)
# 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval
@@ -1403,6 +1437,8 @@ async def alertmanager_webhook(
source="alertmanager",
alertname=alertname,
alert_labels=alert.labels, # Phase 1: 完整 labels
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
)
background_tasks.add_task(

View File

@@ -565,6 +565,23 @@ class IncidentRecord(Base):
comment="事件結果與人類回饋",
)
# === ADR-073 Phase 2 欄位 (2026-04-12 ogt) ===
alertname: Mapped[str | None] = mapped_column(
String(100),
nullable=True,
comment="告警名稱 (從 signals labels 抽取)",
)
notification_type: Mapped[str | None] = mapped_column(
String(10),
nullable=True,
comment="通知類型 TYPE-1/2/3/4/4D (早期分診)",
)
alert_category: Mapped[str | None] = mapped_column(
String(50),
nullable=True,
comment="告警類別 config_drift/info/backup/infrastructure/kubernetes/database/general",
)
# === 頻率快照 (Phase 27, 2026-04-10 ogt) ===
# frequency_stats 原本只存記憶體/Redis(TTL=35天)Pod重啟或超期即失
# 此欄位在 incident 建立時寫入快照,永久保存當時的頻率統計

View File

@@ -79,6 +79,9 @@ def _incident_to_record_data(incident: Incident) -> dict[str, Any]:
"resolved_at": incident.resolved_at,
"closed_at": incident.closed_at,
"frequency_snapshot": frequency_snapshot,
"alertname": incident.signals[0].alert_name if incident.signals else None, # ADR-073 Phase 2-1
"notification_type": incident.notification_type, # ADR-073 Phase 2-2
"alert_category": incident.alert_category, # ADR-073 Phase 2-2
}

View File

@@ -111,6 +111,8 @@ async def create_incident_for_approval(
source: str = "alertmanager",
alertname: str | None = None,
alert_labels: dict | None = None,
notification_type: str | None = None, # ADR-073 Phase 2-2
alert_category: str | None = None, # ADR-073 Phase 2-2
) -> str:
"""
為 Approval 創建對應的 Incident (活躍事件同步)。
@@ -151,6 +153,8 @@ async def create_incident_for_approval(
signals=[signal],
affected_services=_affected_services,
proposal_ids=[UUID(approval_id)],
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
)
await incident_service.save_to_working_memory(incident)