fix(openclaw): DockerContainerUnhealthy + TargetDown 專屬規則匹配
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

- DockerContainerUnhealthy: ssh docker inspect + docker restart,含 healthcheck 指令驗證
- TargetDown / IP:port instance: ssh 確認 exporter 存活
- 修正 target 誤將 alertname 當作 deployment 名稱的問題
- alertname/labels 從 alert_context 提取供規則判斷
- 2026-04-09 ogt: 新增兩條專屬規則

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-09 09:00:31 +08:00
parent 4b6f14d9a1
commit 3abc7c2f85

View File

@@ -584,6 +584,8 @@ class OpenClawService:
raw_namespace = alert_context.get("namespace", "default")
message = alert_context.get("message", "")
metrics = alert_context.get("metrics", {})
labels = alert_context.get("labels", {})
alertname = labels.get("alertname", alert_type)
# Phase 18.1: 正規化資源名稱 (ADR-016)
# 確保 kubectl 指令使用有效的 K8s 名稱
@@ -622,7 +624,75 @@ class OpenClawService:
# 根據告警類型生成專業 RCA + 仲裁
# 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配,不是 AI 仲裁
if "oom" in message.lower() or "memory" in alert_type.lower():
# 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則
if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower():
container_name = labels.get("name", target)
host = labels.get("instance", "").split(":")[0] or "192.168.0.188"
mock_response = {
"action_title": f"檢查 Docker 容器 {container_name} 健康狀態",
"description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}",
"suggested_action": "RESTART_DEPLOYMENT",
"kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'",
"target_resource": container_name,
"namespace": namespace,
"risk_level": "medium",
"blast_radius": {
"affected_pods": 1,
"estimated_downtime": "~30s",
"related_services": [container_name],
"data_impact": "NONE",
},
"primary_responsibility": "INFRA",
"responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態",
"secondary_teams": ["BE"],
"optimization_suggestions": [
{
"type": "HEALTHCHECK",
"description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)",
"kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'",
}
],
"reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。{signoz_correlation}",
"deviation_analysis": "容器健康檢查連續失敗,超出允許次數",
"confidence": 0.0,
"affected_services": [container_name],
"signoz_correlation": signoz_correlation,
}
elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target):
instance = labels.get("instance", raw_target)
job = labels.get("job", "exporter")
host = instance.split(":")[0]
mock_response = {
"action_title": f"檢查 {job} ({instance}) 服務存活",
"description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}",
"suggested_action": "RESTART_DEPLOYMENT",
"kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'",
"target_resource": instance,
"namespace": namespace,
"risk_level": "medium",
"blast_radius": {
"affected_pods": 0,
"estimated_downtime": "監控盲區持續中",
"related_services": [job],
"data_impact": "NONE",
},
"primary_responsibility": "INFRA",
"responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇",
"secondary_teams": [],
"optimization_suggestions": [
{
"type": "MONITORING",
"description": f"確認 {host} 上的 {job} exporter 是否正常運行",
"kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'",
}
],
"reasoning": f"[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}",
"deviation_analysis": "Prometheus scrape 失敗,監控數據中斷",
"confidence": 0.0,
"affected_services": [instance],
"signoz_correlation": signoz_correlation,
}
elif "oom" in message.lower() or "memory" in alert_type.lower():
mock_response = {
"action_title": f"刪除異常 Pod {target} (OOMKilled)",
"description": f"⚙️ 規則匹配: {target} 發生 OOMKilled根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",