refactor(webhook): enable OpenClaw AI RCA for SignOz alerts
All checks were successful
E2E Health Check / e2e-health (push) Successful in 16s
All checks were successful
E2E Health Check / e2e-health (push) Successful in 16s
This commit is contained in:
@@ -136,10 +136,13 @@ async def process_signoz_alert(
|
||||
ADR-037 Phase 21: 完整告警處理流程
|
||||
1. 記錄異常頻率
|
||||
2. 建立 Incident
|
||||
3. 建立 Approval
|
||||
4. 發送 Telegram
|
||||
3. 呼叫 OpenClaw AI 進行 RCA 分析 (Nemo-4B 優先)
|
||||
4. 建立 Approval (帶入 AI 建議)
|
||||
5. 發送 Telegram (帶入 AI 信心度)
|
||||
"""
|
||||
try:
|
||||
from src.services.openclaw import get_openclaw
|
||||
openclaw = get_openclaw()
|
||||
# =================================================================
|
||||
# Step 1: 記錄異常頻率 (ADR-037)
|
||||
# =================================================================
|
||||
@@ -193,7 +196,25 @@ async def process_signoz_alert(
|
||||
return
|
||||
|
||||
# =================================================================
|
||||
# Step 3: 建立 Approval
|
||||
# Step 3: 呼叫 OpenClaw AI 進行 RCA 分析
|
||||
# =================================================================
|
||||
alert_context = {
|
||||
"alert_type": alert_name,
|
||||
"severity": severity,
|
||||
"target_resource": labels.get("service_name", labels.get("service", "unknown")),
|
||||
"namespace": labels.get("namespace", "default"),
|
||||
"message": annotations.get("description", annotations.get("summary", "")),
|
||||
"fingerprint": f"signoz-{alert_name}-{labels.get('service_name', 'unknown')}",
|
||||
"anomaly_frequency": anomaly_frequency,
|
||||
}
|
||||
|
||||
# 這裡會呼叫 NVIDIA Nemo -> Gemini -> Ollama 鏈路
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = await openclaw.analyze_alert(
|
||||
alert_context
|
||||
)
|
||||
|
||||
# =================================================================
|
||||
# Step 4: 建立 Approval
|
||||
# =================================================================
|
||||
approval_id = await create_signoz_approval(
|
||||
alert_name=alert_name,
|
||||
@@ -202,10 +223,11 @@ async def process_signoz_alert(
|
||||
severity=severity,
|
||||
incident_id=incident.incident_id,
|
||||
anomaly_frequency=anomaly_frequency,
|
||||
analysis_result=analysis_result, # 帶入 AI 結果
|
||||
)
|
||||
|
||||
# =================================================================
|
||||
# Step 4: 發送 Telegram 告警
|
||||
# Step 5: 發送 Telegram 告警
|
||||
# =================================================================
|
||||
await send_signoz_telegram(
|
||||
approval_id=approval_id,
|
||||
@@ -214,6 +236,8 @@ async def process_signoz_alert(
|
||||
annotations=annotations,
|
||||
severity=severity,
|
||||
anomaly_frequency=anomaly_frequency,
|
||||
analysis_result=analysis_result, # 帶入 AI 結果
|
||||
ai_provider=ai_provider,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
@@ -244,6 +268,7 @@ async def create_signoz_approval(
|
||||
severity: str,
|
||||
incident_id: str,
|
||||
anomaly_frequency: dict | None = None,
|
||||
analysis_result: "LLMAnalysisResult" | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
為 SignOz 告警建立 Approval 記錄
|
||||
@@ -266,20 +291,37 @@ async def create_signoz_approval(
|
||||
|
||||
# 建立 Approval
|
||||
service_name = labels.get("service_name", labels.get("service", "unknown"))
|
||||
summary = annotations.get("summary", f"SignOz Alert: {alert_name}")
|
||||
description = annotations.get("description", summary)
|
||||
summary = (
|
||||
analysis_result.action_title
|
||||
if analysis_result
|
||||
else annotations.get("summary", f"SignOz Alert: {alert_name}")
|
||||
)
|
||||
description = (
|
||||
analysis_result.description
|
||||
if analysis_result
|
||||
else annotations.get("description", summary)
|
||||
)
|
||||
|
||||
# P1-2 修正: 欄位對齊 ApprovalRequestBase (2026-03-29)
|
||||
# Step 4.2: 決定建議動作與指令
|
||||
action = summary
|
||||
command = ""
|
||||
if analysis_result:
|
||||
command = analysis_result.kubectl_command
|
||||
# 如果 AI 建議重啟但 annotations 有不同建議,以 AI 為準
|
||||
action = f"[AI 建議] {analysis_result.action_title}"
|
||||
else:
|
||||
action = f"SignOz Alert: {alert_name}"
|
||||
approval_request = ApprovalRequestCreate(
|
||||
action=f"SignOz Alert: {alert_name}",
|
||||
action=action,
|
||||
description=description,
|
||||
risk_level=risk_level,
|
||||
blast_radius=BlastRadius(
|
||||
risk_level=analysis_result.risk_level if analysis_result else risk_level,
|
||||
blast_radius=analysis_result.blast_radius if analysis_result else BlastRadius(
|
||||
affected_pods=1,
|
||||
estimated_downtime="0",
|
||||
related_services=[service_name],
|
||||
data_impact=DataImpact.READ_ONLY,
|
||||
),
|
||||
kubectl_command=command,
|
||||
dry_run_checks=[],
|
||||
requested_by="signoz-webhook",
|
||||
metadata={
|
||||
@@ -289,6 +331,7 @@ async def create_signoz_approval(
|
||||
"annotations": annotations,
|
||||
"incident_id": incident_id,
|
||||
"anomaly_frequency": anomaly_frequency,
|
||||
"ai_analyzed": analysis_result is not None,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -308,6 +351,8 @@ async def send_signoz_telegram(
|
||||
annotations: dict,
|
||||
severity: str,
|
||||
anomaly_frequency: dict | None = None,
|
||||
analysis_result: "LLMAnalysisResult" | None = None,
|
||||
ai_provider: str = "none",
|
||||
):
|
||||
"""
|
||||
發送 SignOz 告警到 Telegram
|
||||
@@ -324,15 +369,19 @@ async def send_signoz_telegram(
|
||||
|
||||
await telegram.send_approval_card(
|
||||
approval_id=approval_id,
|
||||
risk_level="critical" if severity == "critical" else (
|
||||
"high" if severity == "error" else "medium"
|
||||
risk_level=analysis_result.risk_level if analysis_result else (
|
||||
"critical" if severity == "critical" else (
|
||||
"high" if severity == "error" else "medium"
|
||||
)
|
||||
),
|
||||
resource_name=service_name,
|
||||
root_cause=summary,
|
||||
suggested_action=description or "請檢查 SignOz 儀表板",
|
||||
primary_responsibility="BE",
|
||||
confidence=0.0, # 🔴 規則匹配/告警轉發,非 AI 分析
|
||||
namespace="signoz",
|
||||
root_cause=analysis_result.description if analysis_result else summary,
|
||||
suggested_action=analysis_result.action_title if analysis_result else (
|
||||
description or "請檢查 SignOz 儀表板"
|
||||
),
|
||||
primary_responsibility=analysis_result.primary_responsibility if analysis_result else "BE",
|
||||
confidence=analysis_result.confidence if analysis_result else 0.0,
|
||||
namespace=labels.get("namespace", "signoz"),
|
||||
anomaly_frequency=anomaly_frequency,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user