fix(openclaw): DockerContainerUnhealthy + TargetDown 專屬規則匹配
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
- DockerContainerUnhealthy: ssh docker inspect + docker restart,含 healthcheck 指令驗證
- TargetDown / IP:port instance: ssh 確認 exporter 存活
- 修正 target 混用 alertname 作為 deployment 名稱的問題
- alertname/labels 從 alert_context 提取供規則判斷
- 2026-04-09 ogt: 新增兩條專屬規則

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -584,6 +584,8 @@ class OpenClawService:
|
||||
raw_namespace = alert_context.get("namespace", "default")
|
||||
message = alert_context.get("message", "")
|
||||
metrics = alert_context.get("metrics", {})
|
||||
labels = alert_context.get("labels", {})
|
||||
alertname = labels.get("alertname", alert_type)
|
||||
|
||||
# Phase 18.1: 正規化資源名稱 (ADR-016)
|
||||
# 確保 kubectl 指令使用有效的 K8s 名稱
|
||||
@@ -622,7 +624,75 @@ class OpenClawService:
|
||||
|
||||
# 根據告警類型生成專業 RCA + 仲裁
|
||||
# 🔴 2026-03-29 ogt: Mock 響應必須標示為規則匹配,不是 AI 仲裁
|
||||
if "oom" in message.lower() or "memory" in alert_type.lower():
|
||||
# 2026-04-09 ogt: 新增 DockerContainerUnhealthy / TargetDown 專屬規則
|
||||
if alertname == "DockerContainerUnhealthy" or "unhealthy" in message.lower():
|
||||
container_name = labels.get("name", target)
|
||||
host = labels.get("instance", "").split(":")[0] or "192.168.0.188"
|
||||
mock_response = {
|
||||
"action_title": f"檢查 Docker 容器 {container_name} 健康狀態",
|
||||
"description": f"⚙️ 規則匹配: Docker 容器 {container_name} ({host}) healthcheck 失敗。常見原因: 應用程式啟動慢、healthcheck 指令錯誤、依賴服務未就緒。{signoz_summary}",
|
||||
"suggested_action": "RESTART_DEPLOYMENT",
|
||||
"kubectl_command": f"ssh {host} 'docker inspect {container_name} --format=\"{{{{.State.Health.Status}}}}\" && docker restart {container_name}'",
|
||||
"target_resource": container_name,
|
||||
"namespace": namespace,
|
||||
"risk_level": "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 1,
|
||||
"estimated_downtime": "~30s",
|
||||
"related_services": [container_name],
|
||||
"data_impact": "NONE",
|
||||
},
|
||||
"primary_responsibility": "INFRA",
|
||||
"responsibility_reasoning": "Docker 容器健康檢查失敗屬基礎設施團隊責任,需確認 healthcheck 設定與容器狀態",
|
||||
"secondary_teams": ["BE"],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": "HEALTHCHECK",
|
||||
"description": "確認 healthcheck 指令在容器內可執行 (mc/curl 是否存在)",
|
||||
"kubectl_or_config": f"ssh {host} 'docker exec {container_name} sh -c \"mc ready local || curl -f http://localhost:9000/minio/health/live\"'",
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] Docker healthcheck 失敗先 restart 恢復服務,同時確認 healthcheck 指令正確。{signoz_correlation}",
|
||||
"deviation_analysis": "容器健康檢查連續失敗,超出允許次數",
|
||||
"confidence": 0.0,
|
||||
"affected_services": [container_name],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
elif alertname == "TargetDown" or (labels.get("job") and ":" in raw_target):
|
||||
instance = labels.get("instance", raw_target)
|
||||
job = labels.get("job", "exporter")
|
||||
host = instance.split(":")[0]
|
||||
mock_response = {
|
||||
"action_title": f"檢查 {job} ({instance}) 服務存活",
|
||||
"description": f"⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。{signoz_summary}",
|
||||
"suggested_action": "RESTART_DEPLOYMENT",
|
||||
"kubectl_command": f"ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'",
|
||||
"target_resource": instance,
|
||||
"namespace": namespace,
|
||||
"risk_level": "medium",
|
||||
"blast_radius": {
|
||||
"affected_pods": 0,
|
||||
"estimated_downtime": "監控盲區持續中",
|
||||
"related_services": [job],
|
||||
"data_impact": "NONE",
|
||||
},
|
||||
"primary_responsibility": "INFRA",
|
||||
"responsibility_reasoning": "Prometheus scrape 目標下線屬基礎設施監控範疇",
|
||||
"secondary_teams": [],
|
||||
"optimization_suggestions": [
|
||||
{
|
||||
"type": "MONITORING",
|
||||
"description": f"確認 {host} 上的 {job} exporter 是否正常運行",
|
||||
"kubectl_or_config": f"ssh {host} 'ps aux | grep exporter'",
|
||||
}
|
||||
],
|
||||
"reasoning": f"[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。{signoz_correlation}",
|
||||
"deviation_analysis": "Prometheus scrape 失敗,監控數據中斷",
|
||||
"confidence": 0.0,
|
||||
"affected_services": [instance],
|
||||
"signoz_correlation": signoz_correlation,
|
||||
}
|
||||
elif "oom" in message.lower() or "memory" in alert_type.lower():
|
||||
mock_response = {
|
||||
"action_title": f"刪除異常 Pod {target} (OOMKilled)",
|
||||
"description": f"⚙️ 規則匹配: {target} 發生 OOMKilled,根因為 JVM Heap 配置與 K8s memory limit 不匹配或存在記憶體洩漏。{signoz_summary}",
|
||||
|
||||
Reference in New Issue
Block a user