diff --git a/apps/api/src/api/v1/signoz_webhook.py b/apps/api/src/api/v1/signoz_webhook.py index f9d136fe..3123ad77 100644 --- a/apps/api/src/api/v1/signoz_webhook.py +++ b/apps/api/src/api/v1/signoz_webhook.py @@ -136,10 +136,13 @@ async def process_signoz_alert( ADR-037 Phase 21: 完整告警處理流程 1. 記錄異常頻率 2. 建立 Incident - 3. 建立 Approval - 4. 發送 Telegram + 3. 呼叫 OpenClaw AI 進行 RCA 分析 (Nemo-4B 優先) + 4. 建立 Approval (帶入 AI 建議) + 5. 發送 Telegram (帶入 AI 信心度) """ try: + from src.services.openclaw import get_openclaw + openclaw = get_openclaw() # ================================================================= # Step 1: 記錄異常頻率 (ADR-037) # ================================================================= @@ -193,7 +196,25 @@ async def process_signoz_alert( return # ================================================================= - # Step 3: 建立 Approval + # Step 3: 呼叫 OpenClaw AI 進行 RCA 分析 + # ================================================================= + alert_context = { + "alert_type": alert_name, + "severity": severity, + "target_resource": labels.get("service_name", labels.get("service", "unknown")), + "namespace": labels.get("namespace", "default"), + "message": annotations.get("description", annotations.get("summary", "")), + "fingerprint": f"signoz-{alert_name}-{labels.get('service_name', 'unknown')}", + "anomaly_frequency": anomaly_frequency, + } + + # 這裡會呼叫 NVIDIA Nemo -> Gemini -> Ollama 鏈路 + analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = await openclaw.analyze_alert( + alert_context + ) + + # ================================================================= + # Step 4: 建立 Approval # ================================================================= approval_id = await create_signoz_approval( alert_name=alert_name, @@ -202,10 +223,11 @@ async def process_signoz_alert( severity=severity, incident_id=incident.incident_id, anomaly_frequency=anomaly_frequency, + analysis_result=analysis_result, # 帶入 AI 結果 ) # ================================================================= - # Step 4: 發送 Telegram 告警 + # Step 5: 發送 Telegram 告警 # ================================================================= await send_signoz_telegram( approval_id=approval_id, @@ -214,6 +236,8 @@ async def process_signoz_alert( annotations=annotations, severity=severity, anomaly_frequency=anomaly_frequency, + analysis_result=analysis_result, # 帶入 AI 結果 + ai_provider=ai_provider, ) logger.info( @@ -244,6 +268,7 @@ async def create_signoz_approval( severity: str, incident_id: str, anomaly_frequency: dict | None = None, + analysis_result: "LLMAnalysisResult" | None = None, ) -> str: """ 為 SignOz 告警建立 Approval 記錄 @@ -266,20 +291,37 @@ async def create_signoz_approval( # 建立 Approval service_name = labels.get("service_name", labels.get("service", "unknown")) - summary = annotations.get("summary", f"SignOz Alert: {alert_name}") - description = annotations.get("description", summary) + summary = ( + analysis_result.action_title + if analysis_result + else annotations.get("summary", f"SignOz Alert: {alert_name}") + ) + description = ( + analysis_result.description + if analysis_result + else annotations.get("description", summary) + ) - # P1-2 修正: 欄位對齊 ApprovalRequestBase (2026-03-29) + # Step 4.2: 決定建議動作與指令 + action = summary + command = "" + if analysis_result: + command = analysis_result.kubectl_command + # 如果 AI 建議重啟但 annotations 有不同建議,以 AI 為準 + action = f"[AI 建議] {analysis_result.action_title}" + else: + action = f"SignOz Alert: {alert_name}" approval_request = ApprovalRequestCreate( - action=f"SignOz Alert: {alert_name}", + action=action, description=description, - risk_level=risk_level, - blast_radius=BlastRadius( + risk_level=analysis_result.risk_level if analysis_result else risk_level, + blast_radius=analysis_result.blast_radius if analysis_result else BlastRadius( affected_pods=1, estimated_downtime="0", related_services=[service_name], data_impact=DataImpact.READ_ONLY, ), + kubectl_command=command, dry_run_checks=[], requested_by="signoz-webhook", metadata={ @@ -289,6 +331,7 @@ async def create_signoz_approval( "annotations": annotations, "incident_id": incident_id, "anomaly_frequency": anomaly_frequency, + "ai_analyzed": analysis_result is not None, }, ) @@ -308,6 +351,8 @@ async def send_signoz_telegram( annotations: dict, severity: str, anomaly_frequency: dict | None = None, + analysis_result: "LLMAnalysisResult" | None = None, + ai_provider: str = "none", ): """ 發送 SignOz 告警到 Telegram @@ -324,15 +369,19 @@ async def send_signoz_telegram( await telegram.send_approval_card( approval_id=approval_id, - risk_level="critical" if severity == "critical" else ( - "high" if severity == "error" else "medium" + risk_level=analysis_result.risk_level if analysis_result else ( + "critical" if severity == "critical" else ( + "high" if severity == "error" else "medium" + ) ), resource_name=service_name, - root_cause=summary, - suggested_action=description or "請檢查 SignOz 儀表板", - primary_responsibility="BE", - confidence=0.0, # 🔴 規則匹配/告警轉發,非 AI 分析 - namespace="signoz", + root_cause=analysis_result.description if analysis_result else summary, + suggested_action=analysis_result.action_title if analysis_result else ( + description or "請檢查 SignOz 儀表板" + ), + primary_responsibility=analysis_result.primary_responsibility if analysis_result else "BE", + confidence=analysis_result.confidence if analysis_result else 0.0, + namespace=labels.get("namespace", "signoz"), anomaly_frequency=anomaly_frequency, )