feat(flywheel): Phase 2-2/2-4 — classify_alert_early + alertname/notification_type/alert_category 寫入
ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type - webhooks.py: 新增 classify_alert_early() — 6 條規則覆蓋 config_drift/info/backup/infra/k8s/db/general - webhooks.py: alertmanager_webhook 呼叫 classify_alert_early() 並傳入兩個 create_incident_for_approval() 呼叫點 - incident_service.py: create_incident_for_approval() 新增 notification_type/alert_category 參數,寫入 Incident model - incident_repository.py: _incident_to_record_data() 新增 alertname/notification_type/alert_category 序列化 - db/models.py: IncidentRecord ORM 新增 alertname/notification_type/alert_category 三個 mapped_column 防止 HostBackupFailed 等告警被誤路由到 K8s executor (ADR-073 Phase 2-4 同步完成) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -951,6 +951,30 @@ class AlertmanagerPayload(BaseModel):
|
||||
alerts: list[AlertmanagerAlert]
|
||||
|
||||
|
||||
def classify_alert_early(alertname: str, severity: str, _labels: dict) -> tuple[str, str]:
|
||||
"""
|
||||
ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。
|
||||
2026-04-12 ogt: 防止 HostBackupFailed 等被誤路由到 K8s executor。
|
||||
|
||||
Returns:
|
||||
tuple[str, str]: (alert_category, notification_type)
|
||||
"""
|
||||
alertname_lower = alertname.lower()
|
||||
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
||||
return "config_drift", "TYPE-4D"
|
||||
if severity in ("info", "none"):
|
||||
return "info", "TYPE-1"
|
||||
if any(kw in alertname_lower for kw in ("backup", "heartbeat")):
|
||||
return "backup", "TYPE-1"
|
||||
if alertname.startswith(("Docker", "Host")):
|
||||
return "infrastructure", "TYPE-3"
|
||||
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||||
return "kubernetes", "TYPE-3"
|
||||
if alertname.startswith(("Postgres", "Redis")):
|
||||
return "database", "TYPE-3"
|
||||
return "general", "TYPE-3"
|
||||
|
||||
|
||||
def is_internal_ip(client_ip: str) -> bool:
|
||||
"""檢查是否為內網 IP"""
|
||||
import ipaddress
|
||||
@@ -1103,6 +1127,14 @@ async def alertmanager_webhook(
|
||||
# I1 整合 ADR-064 Rule Engine 2026-04-11: YAML 規則動態推斷,ALERTNAME_TO_TYPE 為 fallback
|
||||
alert_type = get_incident_type(alertname)
|
||||
|
||||
# ADR-073 Phase 2-2: 早期分診 — 在 LLM 前決定 alert_category + notification_type
|
||||
# 2026-04-12 ogt: 防止 HostBackupFailed 等被誤路由到 K8s executor
|
||||
alert_category, notification_type = classify_alert_early(
|
||||
alertname=alertname,
|
||||
severity=alert.labels.get("severity", "warning"),
|
||||
labels=alert.labels,
|
||||
)
|
||||
|
||||
severity_map = {"critical": "critical", "warning": "warning", "info": "info"}
|
||||
severity = severity_map.get(
|
||||
alert.labels.get("severity", "warning").lower(),
|
||||
@@ -1269,6 +1301,8 @@ async def alertmanager_webhook(
|
||||
source="alertmanager",
|
||||
alertname=alertname,
|
||||
alert_labels=alert.labels, # Phase 1: 完整 labels 供 _extract_affected_services
|
||||
notification_type=notification_type, # ADR-073 Phase 2-2
|
||||
alert_category=alert_category, # ADR-073 Phase 2-2
|
||||
)
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval
|
||||
@@ -1403,6 +1437,8 @@ async def alertmanager_webhook(
|
||||
source="alertmanager",
|
||||
alertname=alertname,
|
||||
alert_labels=alert.labels, # Phase 1: 完整 labels
|
||||
notification_type=notification_type, # ADR-073 Phase 2-2
|
||||
alert_category=alert_category, # ADR-073 Phase 2-2
|
||||
)
|
||||
|
||||
background_tasks.add_task(
|
||||
|
||||
@@ -565,6 +565,23 @@ class IncidentRecord(Base):
|
||||
comment="事件結果與人類回饋",
|
||||
)
|
||||
|
||||
# === ADR-073 Phase 2 欄位 (2026-04-12 ogt) ===
|
||||
alertname: Mapped[str | None] = mapped_column(
|
||||
String(100),
|
||||
nullable=True,
|
||||
comment="告警名稱 (從 signals labels 抽取)",
|
||||
)
|
||||
notification_type: Mapped[str | None] = mapped_column(
|
||||
String(10),
|
||||
nullable=True,
|
||||
comment="通知類型 TYPE-1/2/3/4/4D (早期分診)",
|
||||
)
|
||||
alert_category: Mapped[str | None] = mapped_column(
|
||||
String(50),
|
||||
nullable=True,
|
||||
comment="告警類別 config_drift/info/backup/infrastructure/kubernetes/database/general",
|
||||
)
|
||||
|
||||
# === 頻率快照 (Phase 27, 2026-04-10 ogt) ===
|
||||
# frequency_stats 原本只存記憶體/Redis(TTL=35天),Pod重啟或超期即失
|
||||
# 此欄位在 incident 建立時寫入快照,永久保存當時的頻率統計
|
||||
|
||||
@@ -79,6 +79,9 @@ def _incident_to_record_data(incident: Incident) -> dict[str, Any]:
|
||||
"resolved_at": incident.resolved_at,
|
||||
"closed_at": incident.closed_at,
|
||||
"frequency_snapshot": frequency_snapshot,
|
||||
"alertname": incident.signals[0].alert_name if incident.signals else None, # ADR-073 Phase 2-1
|
||||
"notification_type": incident.notification_type, # ADR-073 Phase 2-2
|
||||
"alert_category": incident.alert_category, # ADR-073 Phase 2-2
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -111,6 +111,8 @@ async def create_incident_for_approval(
|
||||
source: str = "alertmanager",
|
||||
alertname: str | None = None,
|
||||
alert_labels: dict | None = None,
|
||||
notification_type: str | None = None, # ADR-073 Phase 2-2
|
||||
alert_category: str | None = None, # ADR-073 Phase 2-2
|
||||
) -> str:
|
||||
"""
|
||||
為 Approval 創建對應的 Incident (活躍事件同步)。
|
||||
@@ -151,6 +153,8 @@ async def create_incident_for_approval(
|
||||
signals=[signal],
|
||||
affected_services=_affected_services,
|
||||
proposal_ids=[UUID(approval_id)],
|
||||
notification_type=notification_type, # ADR-073 Phase 2-2
|
||||
alert_category=alert_category, # ADR-073 Phase 2-2
|
||||
)
|
||||
|
||||
await incident_service.save_to_working_memory(incident)
|
||||
|
||||
Reference in New Issue
Block a user