feat(api): Phase 18.4 OpenClaw 深度分析整合
All checks were successful
E2E Health Check / e2e-health (push) Successful in 17s

2026-03-31 Claude Code (統帥批准)

新增功能:
- _llm_analyze() 整合 OpenClawService
  - 使用 analyze_alert() 進行 AI RCA 分析
  - 整合 SigNoz 監控數據
  - 支援 Token/Cost 追蹤

- _map_severity_to_risk(): 嚴重度→風險等級映射
  - critical/高 → CRITICAL
  - warning/medium/中 → MEDIUM
  - 其他 → LOW

- _extract_repair_action(): 從 AI 建議提取可執行動作
  - restart/重啟 → restart_deployment/restart_pod
  - clear/清理/cache → clear_cache
  - scale/擴展 → scale_up (需人工授權)

閉環強化:
規則引擎初步分類 → OpenClaw AI 深度分析 → 更精準的修復建議

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-31 12:14:54 +08:00
parent 9a2b1d6653
commit d6f37853c5

View File

@@ -495,65 +495,114 @@ class FailureWatcherService(IFailureWatcher):
target_resource: str,
initial_classification: str,
) -> dict | None:
"""LLM 深度分析失敗原因"""
"""
LLM 深度分析失敗原因
Phase 18.4: OpenClaw 整合
2026-03-31 Claude Code (統帥批准)
使用 OpenClawService 進行 AI 分析,
整合 SignOz 監控數據提供更精準的 RCA。
"""
try:
# 建構 prompt
prompt = f"""你是 AIOps 故障分析專家。請分析以下執行失敗:
from src.services.openclaw import get_openclaw_service
錯誤訊息: {error_message}
操作類型: {operation_type}
目標資源: {target_resource}
初步分類: {initial_classification}
openclaw = get_openclaw_service()
請以 JSON 格式回應:
{{
"classification": "TIMEOUT|K8S_ERROR|NETWORK_ERROR|PERMISSION_DENIED|RESOURCE_ERROR|UNKNOWN",
"root_cause": "根本原因分析 (30字內)",
"suggested_repair": "建議修復策略 (30字內)",
"risk_level": "LOW|MEDIUM|CRITICAL",
"confidence": 0.0-1.0
}}
# 建構告警上下文
alert_context = {
"alert_type": "execution_failure",
"severity": "warning",
"error_message": error_message,
"operation_type": operation_type,
"target_resource": target_resource,
"initial_classification": initial_classification,
"source": "failure_watcher",
}
只輸出 JSON不要其他文字。"""
# 呼叫 LLM
result = await self._model_registry.chat(
messages=[{"role": "user", "content": prompt}],
task_type="failure_analysis",
# 呼叫 OpenClaw 分析 (含 SignOz 整合)
analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = (
await openclaw.analyze_alert(alert_context)
)
if not result or not result.get("success"):
logger.warning(
"llm_failure_analysis_failed",
error=result.get("error") if result else "No response",
)
return None
# 解析 JSON
content = result.get("content", "")
# 嘗試提取 JSON
import re
json_match = re.search(r"\{[^}]+\}", content, re.DOTALL)
if json_match:
analysis = json.loads(json_match.group())
if analysis_result:
# 從 OpenClaw 結果建構修復分析
logger.info(
"llm_failure_analysis_success",
classification=analysis.get("classification"),
risk_level=analysis.get("risk_level"),
confidence=analysis.get("confidence"),
"openclaw_failure_analysis_success",
ai_provider=ai_provider,
severity=analysis_result.severity,
confidence=analysis_result.confidence,
tokens=tokens,
cost_usd=cost,
)
return analysis
# 映射 OpenClaw 結果到修復分析格式
risk_level = self._map_severity_to_risk(analysis_result.severity)
return {
"classification": initial_classification, # 保留規則引擎分類
"root_cause": analysis_result.root_cause_analysis[:100],
"suggested_repair": self._extract_repair_action(
analysis_result.recommended_action
),
"risk_level": risk_level,
"confidence": analysis_result.confidence,
"ai_provider": ai_provider,
"signoz_trace_url": trace_url,
}
logger.warning(
"openclaw_failure_analysis_no_result",
raw_response=raw_response[:200] if raw_response else None,
)
return None
except Exception as e:
logger.warning(
"llm_failure_analysis_error",
"openclaw_failure_analysis_error",
error=str(e),
)
return None
def _map_severity_to_risk(self, severity: str) -> str:
"""
將 OpenClaw severity 映射到修復風險等級
Phase 18.4: 嚴重度映射
"""
severity_lower = severity.lower()
if severity_lower in ["critical", ""]:
return "CRITICAL"
elif severity_lower in ["warning", "medium", ""]:
return "MEDIUM"
else:
return "LOW"
def _extract_repair_action(self, recommended_action: str) -> str:
"""
從 OpenClaw 建議中提取可執行的修復動作
Phase 18.4: 動作提取
"""
action_lower = recommended_action.lower()
# 識別可自動執行的動作
if any(kw in action_lower for kw in ["restart", "重啟", "重新啟動"]):
if "deployment" in action_lower or "部署" in action_lower:
return "restart_deployment"
elif "pod" in action_lower:
return "restart_pod"
return "restart_pod" # 預設重啟 Pod
if any(kw in action_lower for kw in ["clear", "清理", "cache", "快取"]):
return "clear_cache"
if any(kw in action_lower for kw in ["scale", "擴展", "增加"]):
return "scale_up" # 需人工授權
# 無法自動執行,返回原始建議
return recommended_action[:50]
async def _update_audit_log_classification(
self,
audit_log_id: str,