feat(api): Phase 18.4 OpenClaw 深度分析整合
All checks were successful
E2E Health Check / e2e-health (push) Successful in 17s
All checks were successful
E2E Health Check / e2e-health (push) Successful in 17s
2026-03-31 Claude Code (統帥批准)

新增功能:
- _llm_analyze() 整合 OpenClawService
  - 使用 analyze_alert() 進行 AI RCA 分析
  - 整合 SigNoz 監控數據
  - 支援 Token/Cost 追蹤
- _map_severity_to_risk(): 嚴重度→風險等級映射
  - critical/高 → CRITICAL
  - warning/medium/中 → MEDIUM
  - 其他 → LOW
- _extract_repair_action(): 從 AI 建議提取可執行動作
  - restart/重啟 → restart_deployment/restart_pod
  - clear/清理/cache → clear_cache
  - scale/擴展 → scale_up (需人工授權)

閉環強化:
規則引擎初步分類 → OpenClaw AI 深度分析 → 更精準的修復建議

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -495,65 +495,114 @@ class FailureWatcherService(IFailureWatcher):
|
||||
target_resource: str,
|
||||
initial_classification: str,
|
||||
) -> dict | None:
|
||||
"""LLM 深度分析失敗原因"""
|
||||
"""
|
||||
LLM 深度分析失敗原因
|
||||
|
||||
Phase 18.4: OpenClaw 整合
|
||||
2026-03-31 Claude Code (統帥批准)
|
||||
|
||||
使用 OpenClawService 進行 AI 分析,
|
||||
整合 SignOz 監控數據提供更精準的 RCA。
|
||||
"""
|
||||
try:
|
||||
# 建構 prompt
|
||||
prompt = f"""你是 AIOps 故障分析專家。請分析以下執行失敗:
|
||||
from src.services.openclaw import get_openclaw_service
|
||||
|
||||
錯誤訊息: {error_message}
|
||||
操作類型: {operation_type}
|
||||
目標資源: {target_resource}
|
||||
初步分類: {initial_classification}
|
||||
openclaw = get_openclaw_service()
|
||||
|
||||
請以 JSON 格式回應:
|
||||
{{
|
||||
"classification": "TIMEOUT|K8S_ERROR|NETWORK_ERROR|PERMISSION_DENIED|RESOURCE_ERROR|UNKNOWN",
|
||||
"root_cause": "根本原因分析 (30字內)",
|
||||
"suggested_repair": "建議修復策略 (30字內)",
|
||||
"risk_level": "LOW|MEDIUM|CRITICAL",
|
||||
"confidence": 0.0-1.0
|
||||
}}
|
||||
# 建構告警上下文
|
||||
alert_context = {
|
||||
"alert_type": "execution_failure",
|
||||
"severity": "warning",
|
||||
"error_message": error_message,
|
||||
"operation_type": operation_type,
|
||||
"target_resource": target_resource,
|
||||
"initial_classification": initial_classification,
|
||||
"source": "failure_watcher",
|
||||
}
|
||||
|
||||
只輸出 JSON,不要其他文字。"""
|
||||
|
||||
# 呼叫 LLM
|
||||
result = await self._model_registry.chat(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
task_type="failure_analysis",
|
||||
# 呼叫 OpenClaw 分析 (含 SignOz 整合)
|
||||
analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = (
|
||||
await openclaw.analyze_alert(alert_context)
|
||||
)
|
||||
|
||||
if not result or not result.get("success"):
|
||||
logger.warning(
|
||||
"llm_failure_analysis_failed",
|
||||
error=result.get("error") if result else "No response",
|
||||
)
|
||||
return None
|
||||
|
||||
# 解析 JSON
|
||||
content = result.get("content", "")
|
||||
# 嘗試提取 JSON
|
||||
import re
|
||||
|
||||
json_match = re.search(r"\{[^}]+\}", content, re.DOTALL)
|
||||
if json_match:
|
||||
analysis = json.loads(json_match.group())
|
||||
if analysis_result:
|
||||
# 從 OpenClaw 結果建構修復分析
|
||||
logger.info(
|
||||
"llm_failure_analysis_success",
|
||||
classification=analysis.get("classification"),
|
||||
risk_level=analysis.get("risk_level"),
|
||||
confidence=analysis.get("confidence"),
|
||||
"openclaw_failure_analysis_success",
|
||||
ai_provider=ai_provider,
|
||||
severity=analysis_result.severity,
|
||||
confidence=analysis_result.confidence,
|
||||
tokens=tokens,
|
||||
cost_usd=cost,
|
||||
)
|
||||
return analysis
|
||||
|
||||
# 映射 OpenClaw 結果到修復分析格式
|
||||
risk_level = self._map_severity_to_risk(analysis_result.severity)
|
||||
|
||||
return {
|
||||
"classification": initial_classification, # 保留規則引擎分類
|
||||
"root_cause": analysis_result.root_cause_analysis[:100],
|
||||
"suggested_repair": self._extract_repair_action(
|
||||
analysis_result.recommended_action
|
||||
),
|
||||
"risk_level": risk_level,
|
||||
"confidence": analysis_result.confidence,
|
||||
"ai_provider": ai_provider,
|
||||
"signoz_trace_url": trace_url,
|
||||
}
|
||||
|
||||
logger.warning(
|
||||
"openclaw_failure_analysis_no_result",
|
||||
raw_response=raw_response[:200] if raw_response else None,
|
||||
)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"llm_failure_analysis_error",
|
||||
"openclaw_failure_analysis_error",
|
||||
error=str(e),
|
||||
)
|
||||
return None
|
||||
|
||||
def _map_severity_to_risk(self, severity: str) -> str:
|
||||
"""
|
||||
將 OpenClaw severity 映射到修復風險等級
|
||||
|
||||
Phase 18.4: 嚴重度映射
|
||||
"""
|
||||
severity_lower = severity.lower()
|
||||
if severity_lower in ["critical", "高"]:
|
||||
return "CRITICAL"
|
||||
elif severity_lower in ["warning", "medium", "中"]:
|
||||
return "MEDIUM"
|
||||
else:
|
||||
return "LOW"
|
||||
|
||||
def _extract_repair_action(self, recommended_action: str) -> str:
|
||||
"""
|
||||
從 OpenClaw 建議中提取可執行的修復動作
|
||||
|
||||
Phase 18.4: 動作提取
|
||||
"""
|
||||
action_lower = recommended_action.lower()
|
||||
|
||||
# 識別可自動執行的動作
|
||||
if any(kw in action_lower for kw in ["restart", "重啟", "重新啟動"]):
|
||||
if "deployment" in action_lower or "部署" in action_lower:
|
||||
return "restart_deployment"
|
||||
elif "pod" in action_lower:
|
||||
return "restart_pod"
|
||||
return "restart_pod" # 預設重啟 Pod
|
||||
|
||||
if any(kw in action_lower for kw in ["clear", "清理", "cache", "快取"]):
|
||||
return "clear_cache"
|
||||
|
||||
if any(kw in action_lower for kw in ["scale", "擴展", "增加"]):
|
||||
return "scale_up" # 需人工授權
|
||||
|
||||
# 無法自動執行,返回原始建議
|
||||
return recommended_action[:50]
|
||||
|
||||
async def _update_audit_log_classification(
|
||||
self,
|
||||
audit_log_id: str,
|
||||
|
||||
Reference in New Issue
Block a user