diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 3ecaa7b7..ba1e38e0 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -584,6 +584,8 @@ class OpenClawService: raw_namespace = alert_context.get("namespace", "default") message = alert_context.get("message", "") metrics = alert_context.get("metrics", {}) + labels = alert_context.get("labels", {}) + alertname = labels.get("alertname", alert_type) # Phase 18.1: 正規化資源名稱 (ADR-016) # 確保 kubectl 指令使用有效的 K8s 名稱 @@ -622,7 +624,75 @@ class OpenClawService: # 根據告警類型生成專業 RCA + 仲裁 # 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配,不是 AI 仲裁 - if "oom" in message.lower() or "memory" in alert_type.lower(): + # 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則 + if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower(): + container_name = labels.get("name", target) + host = labels.get("instance", "").split(":")[0] or "192.168.0.188" + mock_response = { + "action_title": f"檢查 Docker 容器 {container_name} 健康狀態", + "description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}", + "suggested_action": "RESTART_DEPLOYMENT", + "kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'", + "target_resource": container_name, + "namespace": namespace, + "risk_level": "medium", + "blast_radius": { + "affected_pods": 1, + "estimated_downtime": "~30s", + "related_services": [container_name], + "data_impact": "NONE", + }, + "primary_responsibility": "INFRA", + "responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態", + "secondary_teams": ["BE"], + "optimization_suggestions": [ + { + "type": "HEALTHCHECK", + "description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)", + "kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'", + } + ], + "reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。{signoz_correlation}", + "deviation_analysis": "容器健康檢查連續失敗,超出允許次數", + "confidence": 0.0, + "affected_services": [container_name], + "signoz_correlation": signoz_correlation, + } + elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target): + instance = labels.get("instance", raw_target) + job = labels.get("job", "exporter") + host = instance.split(":")[0] + mock_response = { + "action_title": f"檢查 {job} ({instance}) 服務存活", + "description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}", + "suggested_action": "RESTART_DEPLOYMENT", + "kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'", + "target_resource": instance, + "namespace": namespace, + "risk_level": "medium", + "blast_radius": { + "affected_pods": 0, + "estimated_downtime": "監控盲區持續中", + "related_services": [job], + "data_impact": "NONE", + }, + "primary_responsibility": "INFRA", + "responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇", + "secondary_teams": [], + "optimization_suggestions": [ + { + "type": "MONITORING", + "description": f"確認 {host} 上的 {job} exporter 是否正常運行", + "kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'", + } + ], + "reasoning": f"[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}", + "deviation_analysis": "Prometheus scrape 失敗,監控數據中斷", + "confidence": 0.0, + "affected_services": [instance], + "signoz_correlation": signoz_correlation, + } + elif "oom" in message.lower() or "memory" in alert_type.lower(): mock_response = { "action_title": f"刪除異常 Pod {target} (OOMKilled)", "description": f"⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",