fix(backfill): 補充 ADR-075 三種新分類 (secops/flywheel_health/business)
_classify_alert() 與 classify_alert_early() 規則對齊,確保回填腳本正確分類存量 incidents。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,14 @@
|
||||
"""
|
||||
ADR-073 Phase 2-5: 回填 incidents.alertname 欄位
|
||||
2026-04-12 ogt: 舊資料 signals JSONB 用 alert_name key (非 alertname alias)
|
||||
新資料 (Phase 2-1 修復後) 已有 alertname alias
|
||||
ADR-073 回填腳本:修補存量 incidents 三個 NULL 欄位
|
||||
- alertname
|
||||
- notification_type
|
||||
- alert_category
|
||||
|
||||
根本原因:save_to_episodic_memory() 建立 IncidentRecord 時漏掉這三個欄位。
|
||||
2026-04-12 ogt (ADR-073 修補 Fix #1)
|
||||
|
||||
執行方式:
|
||||
kubectl exec -n awoooi-prod <api-pod> -- python3 scripts/backfill_alertname.py
|
||||
kubectl exec -n awoooi-prod <api-pod> -- python3 /app/scripts/backfill_alertname.py
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
@@ -15,40 +19,119 @@ from sqlalchemy import text
|
||||
from src.db.base import get_db_context
|
||||
|
||||
|
||||
def _classify_alert(alertname: str, severity: str) -> tuple[str, str]:
|
||||
"""Python 版分類邏輯,與 classify_alert_early() 保持一致 (ADR-075 更新)"""
|
||||
alertname_lower = alertname.lower()
|
||||
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
||||
return "config_drift", "TYPE-4D"
|
||||
if severity in ("info", "none"):
|
||||
return "info", "TYPE-1"
|
||||
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
|
||||
return "backup", "TYPE-1"
|
||||
# ADR-075 新增: SecOps 優先
|
||||
if any(alertname.startswith(p) for p in ("UnauthorizedSSH", "KubeAudit", "CVECritical", "WAFAttack", "PodAbnormal", "SecurityBreach")):
|
||||
return "secops", "TYPE-5S"
|
||||
# ADR-075 新增: Flywheel/META
|
||||
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or any(
|
||||
alertname.startswith(p) for p in ("Flywheel", "MCPProvider", "OllamaDown", "NemotronDown")
|
||||
):
|
||||
return "flywheel_health", "TYPE-8M"
|
||||
# ADR-075 新增: Business/FinOps
|
||||
if any(alertname.startswith(p) for p in ("AITokenCost", "GeminiAPIError", "SLOBurn", "APIErrorBudget", "MomoScraper", "ScraperSuccess")):
|
||||
return "business", "TYPE-6B"
|
||||
if alertname.startswith(("Docker", "Host")):
|
||||
return "infrastructure", "TYPE-3"
|
||||
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||||
return "kubernetes", "TYPE-3"
|
||||
if alertname.startswith(("Postgres", "Redis")):
|
||||
return "database", "TYPE-3"
|
||||
return "general", "TYPE-3"
|
||||
|
||||
|
||||
async def main() -> None:
    """Backfill NULL ``alertname``, ``notification_type`` and ``alert_category`` on incidents.

    Steps:
        1. Print how many rows currently have each of the three columns NULL.
        2. Fill ``alertname`` in SQL from the first element of ``signals``.
        3. For every row that has an ``alertname`` but is missing either
           classification column, compute both with ``_classify_alert`` and
           write them back.
        4. Print the remaining NULL counts and a final verdict.

    Steps 2 and 3 are committed separately.
    """
    async with get_db_context() as db:
        # --- Step 1: report the current NULL counts ---
        null_r = await db.execute(text("""
            SELECT
                COUNT(*) FILTER (WHERE alertname IS NULL) AS alertname_null,
                COUNT(*) FILTER (WHERE notification_type IS NULL) AS notification_type_null,
                COUNT(*) FILTER (WHERE alert_category IS NULL) AS alert_category_null,
                COUNT(*) AS total
            FROM incidents
        """))
        row = null_r.fetchone()
        print(f"總計: {row.total} 筆")
        print(f" alertname NULL: {row.alertname_null}")
        print(f" notification_type NULL: {row.notification_type_null}")
        print(f" alert_category NULL: {row.alert_category_null}")

        # --- Step 2: backfill alertname in SQL ---
        # signals is a JSONB array; the serialized Signal model uses the key
        # alert_name, and the labels dict also carries an alertname key.
        #
        # The WHERE clause filters on the extracted value instead of calling
        # json_array_length(signals): that function is only defined for the
        # json type (not jsonb) and raises on non-array values, whereas the
        # -> / ->> operators simply yield NULL when the structure does not
        # match. This also keeps rowcount equal to the rows actually filled.
        result = await db.execute(text("""
            UPDATE incidents
            SET alertname = COALESCE(
                signals->0->'labels'->>'alertname',
                signals->0->>'alert_name',
                signals->0->>'alertname'
            )
            WHERE alertname IS NULL
              AND COALESCE(
                signals->0->'labels'->>'alertname',
                signals->0->>'alert_name',
                signals->0->>'alertname'
              ) IS NOT NULL
        """))
        await db.commit()
        print(f"\n✅ alertname 回填: {result.rowcount} 筆")

        # --- Step 3: backfill notification_type + alert_category in Python ---
        # The classification rules live in Python, so read alertname + severity
        # from the DB and update row by row.
        rows_r = await db.execute(text("""
            SELECT incident_id, alertname, severity
            FROM incidents
            WHERE (notification_type IS NULL OR alert_category IS NULL)
              AND alertname IS NOT NULL
        """))
        rows = rows_r.fetchall()
        print(f"\n待分類回填: {len(rows)} 筆")

        updated = 0
        for row in rows:
            alert_category, notification_type = _classify_alert(
                alertname=row.alertname or "",
                # Rows without a severity are classified as "warning".
                severity=row.severity or "warning",
            )
            await db.execute(text("""
                UPDATE incidents
                SET notification_type = :notification_type,
                    alert_category = :alert_category
                WHERE incident_id = :incident_id
            """), {
                "notification_type": notification_type,
                "alert_category": alert_category,
                "incident_id": row.incident_id,
            })
            updated += 1

        await db.commit()
        print(f"✅ notification_type + alert_category 回填: {updated} 筆")

        # --- Step 4: final statistics ---
        final_r = await db.execute(text("""
            SELECT
                COUNT(*) FILTER (WHERE alertname IS NULL) AS alertname_null,
                COUNT(*) FILTER (WHERE notification_type IS NULL) AS notification_type_null,
                COUNT(*) FILTER (WHERE alert_category IS NULL) AS alert_category_null
            FROM incidents
        """))
        final = final_r.fetchone()
        print("\n最終 NULL 統計:")
        print(f" alertname NULL: {final.alertname_null}")
        print(f" notification_type NULL: {final.notification_type_null}")
        print(f" alert_category NULL: {final.alert_category_null}")

        if final.alertname_null == 0 and final.notification_type_null == 0 and final.alert_category_null == 0:
            print("\n✅ 三個欄位全部回填完成")
        else:
            print("\n⚠️ 部分記錄仍為 NULL (可能 signals 為空或格式異常)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user