"""Backfill script for ADR-073: repair three NULL columns on existing incidents.

Columns repaired:
- alertname
- notification_type
- alert_category

Root cause: save_to_episodic_memory() left these three fields out when it
built the IncidentRecord.
2026-04-12 ogt (ADR-073 patch, Fix #1)

Usage:
    kubectl exec -n awoooi-prod -- python3 /app/scripts/backfill_alertname.py
"""
import asyncio
import sys

# NOTE(review): the visible patch skips a few lines between the stdlib imports
# and the project imports (presumably sys.path setup) -- restore them from the
# real file before running this script.

# Exact alert names that are always classified as configuration drift.
_CONFIG_DRIFT_NAMES = ("ConfigurationDrift", "KubeConfigDrift")
# Severities that short-circuit to the informational category.
_INFO_SEVERITIES = ("info", "none")
# ADR-075: SecOps prefixes, checked before the generic Kube*/Pod* rules.
_SECOPS_PREFIXES = (
    "UnauthorizedSSH",
    "KubeAudit",
    "CVECritical",
    "WAFAttack",
    "PodAbnormal",
    "SecurityBreach",
)
# ADR-075: Flywheel/META alerts (exact names plus prefixes).
_FLYWHEEL_NAMES = ("AutoRepairLowSuccessRate", "PermanentFixRequired")
_FLYWHEEL_PREFIXES = ("Flywheel", "MCPProvider", "OllamaDown", "NemotronDown")
# ADR-075: Business/FinOps prefixes.
_BUSINESS_PREFIXES = (
    "AITokenCost",
    "GeminiAPIError",
    "SLOBurn",
    "APIErrorBudget",
    "MomoScraper",
    "ScraperSuccess",
)

# NULL counters for the three backfilled columns plus the table total.
_NULL_STATS_SQL = """
    SELECT
        COUNT(*) FILTER (WHERE alertname IS NULL) AS alertname_null,
        COUNT(*) FILTER (WHERE notification_type IS NULL) AS notification_type_null,
        COUNT(*) FILTER (WHERE alert_category IS NULL) AS alert_category_null,
        COUNT(*) AS total
    FROM incidents
"""

# The first signal may carry the name under labels.alertname, alert_name
# (the serialized Signal key) or alertname. Only rows where one of them is
# present are touched, so rowcount equals the number of rows really filled.
# The `->` / `->>` operators yield NULL for a NULL, empty or non-array value,
# so no array-length guard is needed. The previous guard used
# json_array_length(), which is the json variant and is not expected to
# resolve for a jsonb column.
_BACKFILL_ALERTNAME_SQL = """
    UPDATE incidents
    SET alertname = COALESCE(
        signals->0->'labels'->>'alertname',
        signals->0->>'alert_name',
        signals->0->>'alertname'
    )
    WHERE alertname IS NULL
      AND COALESCE(
        signals->0->'labels'->>'alertname',
        signals->0->>'alert_name',
        signals->0->>'alertname'
      ) IS NOT NULL
"""

# Incidents that have an alertname but still lack a classification column.
_PENDING_CLASSIFICATION_SQL = """
    SELECT incident_id, alertname, severity
    FROM incidents
    WHERE (notification_type IS NULL OR alert_category IS NULL)
      AND alertname IS NOT NULL
"""

# COALESCE keeps a value that is already present: a row is selected when
# EITHER column is NULL, and the populated one must not be overwritten.
_APPLY_CLASSIFICATION_SQL = """
    UPDATE incidents
    SET notification_type = COALESCE(notification_type, :notification_type),
        alert_category = COALESCE(alert_category, :alert_category)
    WHERE incident_id = :incident_id
"""


def _classify_alert(alertname: str, severity: str) -> tuple[str, str]:
    """Classify an alert into ``(alert_category, notification_type)``.

    Python port of the classification rules; per the original note it is meant
    to stay aligned with classify_alert_early() (updated for ADR-075). Rules
    are evaluated top to bottom and the first match wins, so the order below
    is significant (e.g. "KubeAudit*" must hit SecOps before the generic
    "Kube*" rule).

    Args:
        alertname: Alert name; matching is case-sensitive except for the
            "watchdog" substring check.
        severity: Alert severity string, compared case-sensitively.

    Returns:
        A ``(alert_category, notification_type)`` tuple; falls back to
        ``("general", "TYPE-3")`` when no rule matches.
    """
    if alertname in _CONFIG_DRIFT_NAMES:
        return "config_drift", "TYPE-4D"
    if severity in _INFO_SEVERITIES:
        return "info", "TYPE-1"
    if "watchdog" in alertname.lower() or alertname == "Heartbeat":
        return "backup", "TYPE-1"
    # str.startswith accepts a tuple of prefixes, so no any()/generator needed.
    if alertname.startswith(_SECOPS_PREFIXES):
        return "secops", "TYPE-5S"
    if alertname in _FLYWHEEL_NAMES or alertname.startswith(_FLYWHEEL_PREFIXES):
        return "flywheel_health", "TYPE-8M"
    if alertname.startswith(_BUSINESS_PREFIXES):
        return "business", "TYPE-6B"
    if alertname.startswith(("Docker", "Host")):
        return "infrastructure", "TYPE-3"
    if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
        return "kubernetes", "TYPE-3"
    if alertname.startswith(("Postgres", "Redis")):
        return "database", "TYPE-3"
    return "general", "TYPE-3"


async def main() -> None:
    """Backfill alertname, notification_type and alert_category on incidents.

    Steps:
        1. Print how many rows have each column NULL.
        2. Fill ``alertname`` from the first element of ``signals`` in SQL.
        3. Classify every row that has an alertname but lacks a
           classification column, and write the missing values back.
        4. Print the final NULL counts and a summary line.
    """
    # Third-party/project imports live here so the pure classification helper
    # can be imported without the database stack being installed.
    from sqlalchemy import text
    from src.db.base import get_db_context

    async with get_db_context() as db:
        # --- Step 1: report the current NULL situation ---
        before = (await db.execute(text(_NULL_STATS_SQL))).fetchone()
        print(f"總計: {before.total} 筆")
        print(f"  alertname NULL: {before.alertname_null}")
        print(f"  notification_type NULL: {before.notification_type_null}")
        print(f"  alert_category NULL: {before.alert_category_null}")

        # --- Step 2: backfill alertname in SQL ---
        result = await db.execute(text(_BACKFILL_ALERTNAME_SQL))
        await db.commit()
        print(f"\n✅ alertname 回填: {result.rowcount} 筆")

        # --- Step 3: backfill notification_type + alert_category ---
        # The classification rules live in Python, so read the inputs out,
        # classify them here, and write the results back in one batch.
        pending = (await db.execute(text(_PENDING_CLASSIFICATION_SQL))).fetchall()
        print(f"\n待分類回填: {len(pending)} 筆")

        updates = []
        for row in pending:
            alert_category, notification_type = _classify_alert(
                alertname=row.alertname or "",
                severity=row.severity or "warning",
            )
            updates.append({
                "notification_type": notification_type,
                "alert_category": alert_category,
                "incident_id": row.incident_id,
            })

        # A list of parameter dicts makes SQLAlchemy run an executemany
        # instead of one round trip per incident; skip it when nothing is
        # pending because an empty parameter list is not a valid batch.
        if updates:
            await db.execute(text(_APPLY_CLASSIFICATION_SQL), updates)
        await db.commit()
        print(f"✅ notification_type + alert_category 回填: {len(updates)} 筆")

        # --- Step 4: final statistics ---
        after = (await db.execute(text(_NULL_STATS_SQL))).fetchone()
        print("\n最終 NULL 統計:")
        print(f"  alertname NULL: {after.alertname_null}")
        print(f"  notification_type NULL: {after.notification_type_null}")
        print(f"  alert_category NULL: {after.alert_category_null}")

        remaining = (
            after.alertname_null,
            after.notification_type_null,
            after.alert_category_null,
        )
        if not any(remaining):
            print("\n✅ 三個欄位全部回填完成")
        else:
            print("\n⚠️ 部分記錄仍為 NULL (可能 signals 為空或格式異常)")


if __name__ == "__main__":
    # NOTE(review): the body of this guard is cut off in the visible source;
    # asyncio.run(main()) is assumed for an async entry point -- confirm.
    asyncio.run(main())