refactor(webhook): enable OpenClaw AI RCA for SigNoz alerts
All checks were successful
E2E Health Check / e2e-health (push) Successful in 16s

This commit is contained in:
OG T
2026-03-31 15:25:03 +08:00
parent dffb535220
commit 3b7098caef

View File

@@ -136,10 +136,13 @@ async def process_signoz_alert(
ADR-037 Phase 21: 完整告警處理流程
1. 記錄異常頻率
2. 建立 Incident
3. 建立 Approval
4. 發送 Telegram
3. 呼叫 OpenClaw AI 進行 RCA 分析 (Nemo-4B 優先)
4. 建立 Approval (帶入 AI 建議)
5. 發送 Telegram (帶入 AI 信心度)
"""
try:
from src.services.openclaw import get_openclaw
openclaw = get_openclaw()
# =================================================================
# Step 1: 記錄異常頻率 (ADR-037)
# =================================================================
@@ -193,7 +196,25 @@ async def process_signoz_alert(
return
# =================================================================
# Step 3: 建立 Approval
# Step 3: 呼叫 OpenClaw AI 進行 RCA 分析
# =================================================================
alert_context = {
"alert_type": alert_name,
"severity": severity,
"target_resource": labels.get("service_name", labels.get("service", "unknown")),
"namespace": labels.get("namespace", "default"),
"message": annotations.get("description", annotations.get("summary", "")),
"fingerprint": f"signoz-{alert_name}-{labels.get('service_name', 'unknown')}",
"anomaly_frequency": anomaly_frequency,
}
# 這裡會呼叫 NVIDIA Nemo -> Gemini -> Ollama 鏈路
analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = await openclaw.analyze_alert(
alert_context
)
# =================================================================
# Step 4: 建立 Approval
# =================================================================
approval_id = await create_signoz_approval(
alert_name=alert_name,
@@ -202,10 +223,11 @@ async def process_signoz_alert(
severity=severity,
incident_id=incident.incident_id,
anomaly_frequency=anomaly_frequency,
analysis_result=analysis_result, # 帶入 AI 結果
)
# =================================================================
# Step 4: 發送 Telegram 告警
# Step 5: 發送 Telegram 告警
# =================================================================
await send_signoz_telegram(
approval_id=approval_id,
@@ -214,6 +236,8 @@ async def process_signoz_alert(
annotations=annotations,
severity=severity,
anomaly_frequency=anomaly_frequency,
analysis_result=analysis_result, # 帶入 AI 結果
ai_provider=ai_provider,
)
logger.info(
@@ -244,6 +268,7 @@ async def create_signoz_approval(
severity: str,
incident_id: str,
anomaly_frequency: dict | None = None,
# Quote the whole union: `"LLMAnalysisResult" | None` would evaluate `str | None` and raise TypeError when annotations are evaluated eagerly.
analysis_result: "LLMAnalysisResult | None" = None,
) -> str:
"""
為 SignOz 告警建立 Approval 記錄
@@ -266,20 +291,37 @@ async def create_signoz_approval(
# 建立 Approval
service_name = labels.get("service_name", labels.get("service", "unknown"))
summary = annotations.get("summary", f"SignOz Alert: {alert_name}")
description = annotations.get("description", summary)
summary = (
analysis_result.action_title
if analysis_result
else annotations.get("summary", f"SignOz Alert: {alert_name}")
)
description = (
analysis_result.description
if analysis_result
else annotations.get("description", summary)
)
# P1-2 修正: 欄位對齊 ApprovalRequestBase (2026-03-29)
# Step 4.2: 決定建議動作與指令
action = summary
command = ""
if analysis_result:
command = analysis_result.kubectl_command
# 如果 AI 建議重啟但 annotations 有不同建議,以 AI 為準
action = f"[AI 建議] {analysis_result.action_title}"
else:
action = f"SignOz Alert: {alert_name}"
approval_request = ApprovalRequestCreate(
action=f"SignOz Alert: {alert_name}",
action=action,
description=description,
risk_level=risk_level,
blast_radius=BlastRadius(
risk_level=analysis_result.risk_level if analysis_result else risk_level,
blast_radius=analysis_result.blast_radius if analysis_result else BlastRadius(
affected_pods=1,
estimated_downtime="0",
related_services=[service_name],
data_impact=DataImpact.READ_ONLY,
),
kubectl_command=command,
dry_run_checks=[],
requested_by="signoz-webhook",
metadata={
@@ -289,6 +331,7 @@ async def create_signoz_approval(
"annotations": annotations,
"incident_id": incident_id,
"anomaly_frequency": anomaly_frequency,
"ai_analyzed": analysis_result is not None,
},
)
@@ -308,6 +351,8 @@ async def send_signoz_telegram(
annotations: dict,
severity: str,
anomaly_frequency: dict | None = None,
# Quote the whole union: `"LLMAnalysisResult" | None` would evaluate `str | None` and raise TypeError when annotations are evaluated eagerly.
analysis_result: "LLMAnalysisResult | None" = None,
ai_provider: str = "none",
):
"""
發送 SignOz 告警到 Telegram
@@ -324,15 +369,19 @@ async def send_signoz_telegram(
await telegram.send_approval_card(
approval_id=approval_id,
risk_level="critical" if severity == "critical" else (
"high" if severity == "error" else "medium"
risk_level=analysis_result.risk_level if analysis_result else (
"critical" if severity == "critical" else (
"high" if severity == "error" else "medium"
)
),
resource_name=service_name,
root_cause=summary,
suggested_action=description or "請檢查 SignOz 儀表板",
primary_responsibility="BE",
confidence=0.0, # 🔴 規則匹配/告警轉發,非 AI 分析
namespace="signoz",
root_cause=analysis_result.description if analysis_result else summary,
suggested_action=analysis_result.action_title if analysis_result else (
description or "請檢查 SignOz 儀表板"
),
primary_responsibility=analysis_result.primary_responsibility if analysis_result else "BE",
confidence=analysis_result.confidence if analysis_result else 0.0,
namespace=labels.get("namespace", "signoz"),
anomaly_frequency=anomaly_frequency,
)