fix host alert auto-repair routing and backup false positives
This commit is contained in:
@@ -668,7 +668,8 @@ rules:
|
||||
action_title: "🔍 備份失敗自動診斷 — SSH 收集備份與磁碟狀態"
|
||||
description: "⚠️ 備份任務失敗。先自動 SSH 收集 backup log、last_success 與磁碟空間;若無法確認安全修復,立即升級緊急介入。"
|
||||
suggested_action: SSH_DIAGNOSE
|
||||
kubectl_command: "ssh {host} 'echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 補上 ps aux 讓 _ssh_execute 走 diagnostics 路徑(無阻擋)
|
||||
kubectl_command: "ssh {host} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
|
||||
estimated_downtime: "N/A"
|
||||
risk: low
|
||||
responsibility: INFRA
|
||||
|
||||
@@ -1751,7 +1751,70 @@ class DecisionManager:
|
||||
執行後發 Telegram 結果通知 (統帥要求: 修復結果對應同一告警)
|
||||
2026-04-09 Claude Sonnet 4.6 Asia/Taipei
|
||||
"""
|
||||
action = token.proposal_data.get("kubectl_command", "")
|
||||
action = token.proposal_data.get("kubectl_command", "") or token.proposal_data.get("action", "")
|
||||
_alert_labels = incident.signals[0].labels if incident.signals else {}
|
||||
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: YAML 是權威,先覆蓋 LLM 生成的 action
|
||||
# 根因:LLM/Phase2 會先產出 node-exporter/kubectl 的錯域建議,導致
|
||||
# bare_metal 告警每次都被攔回人工,形成循環。
|
||||
# 修法:先執行 YAML match_rule 對齊,若 YAML 有可落地 command 則覆蓋。
|
||||
_alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
|
||||
if _alertname_for_yaml:
|
||||
try:
|
||||
from src.services.alert_rule_engine import match_rule as _yaml_match
|
||||
_yaml_r = _yaml_match({
|
||||
"labels": _alert_labels,
|
||||
"alert_type": _alertname_for_yaml,
|
||||
"message": "",
|
||||
"target_resource": incident.affected_services[0] if incident.affected_services else "unknown",
|
||||
"namespace": "awoooi-prod",
|
||||
})
|
||||
if _yaml_r:
|
||||
if _yaml_r.get("suggested_action") == "NO_ACTION":
|
||||
logger.info(
|
||||
"auto_execute_yaml_no_action",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
reason="YAML 規則明確標記 NO_ACTION,不執行自動修復",
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}"
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
|
||||
_yaml_blocked = _yaml_r.get("blocked_reason", "")
|
||||
if "INVALID_TARGET" in _yaml_blocked:
|
||||
logger.warning(
|
||||
"auto_execute_yaml_invalid_target",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
blocked_reason=_yaml_blocked,
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["blocked_reason"] = _yaml_blocked
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
|
||||
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
|
||||
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action
|
||||
action = _yaml_cmd
|
||||
token.proposal_data["action"] = action
|
||||
token.proposal_data["kubectl_command"] = action
|
||||
await self._save_token(token)
|
||||
logger.info(
|
||||
"auto_execute_yaml_cmd_override",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
yaml_cmd=action[:80],
|
||||
)
|
||||
except Exception as _yaml_err:
|
||||
logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))
|
||||
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: bare_metal × kubectl 拒絕守衛
|
||||
# 根因:HostHighCpuLoad / HostOutOfMemory 等主機層告警 fire 在實體機(如 110,
|
||||
@@ -1760,7 +1823,6 @@ class DecisionManager:
|
||||
# 重啟 awoooi 服務根本解不了第三方 CPU 燒爆,只是拖累自己。
|
||||
# 修法:偵測到 alert host_type=bare_metal 且 action 是 kubectl 類,立即降級人工,
|
||||
# Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。
|
||||
_alert_labels = incident.signals[0].labels if incident.signals else {}
|
||||
_host_type = (_alert_labels.get("host_type") or "").lower()
|
||||
_action_stripped = action.lstrip().lower()
|
||||
if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"):
|
||||
@@ -1789,69 +1851,6 @@ class DecisionManager:
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
# 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復
|
||||
# 根因:LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節
|
||||
# YAML 規則是人工審閱的權威來源,LLM 只是輔助
|
||||
# 修復策略:
|
||||
# 1. YAML → NO_ACTION → 立即返回,不執行任何操作
|
||||
# 2. YAML → SSH 指令(非 kubectl)→ 覆蓋 LLM 生成的 action,讓 SSH 路由生效
|
||||
_alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
|
||||
if _alertname_for_yaml:
|
||||
try:
|
||||
from src.services.alert_rule_engine import match_rule as _yaml_match
|
||||
_yaml_r = _yaml_match({
|
||||
"labels": incident.signals[0].labels if incident.signals else {},
|
||||
"alert_type": _alertname_for_yaml,
|
||||
"message": "",
|
||||
"target_resource": incident.affected_services[0] if incident.affected_services else "unknown",
|
||||
"namespace": "awoooi-prod",
|
||||
})
|
||||
if _yaml_r:
|
||||
if _yaml_r.get("suggested_action") == "NO_ACTION":
|
||||
logger.info(
|
||||
"auto_execute_yaml_no_action",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
reason="YAML 規則明確標記 NO_ACTION,不執行自動修復",
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}"
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
|
||||
# 根因:target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason
|
||||
# 系統不應繼續嘗試執行,提早返回讓 SRE 介入
|
||||
_yaml_blocked = _yaml_r.get("blocked_reason", "")
|
||||
if "INVALID_TARGET" in _yaml_blocked:
|
||||
logger.warning(
|
||||
"auto_execute_yaml_invalid_target",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
blocked_reason=_yaml_blocked,
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["blocked_reason"] = _yaml_blocked
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
|
||||
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
|
||||
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action
|
||||
action = _yaml_cmd
|
||||
logger.info(
|
||||
"auto_execute_yaml_cmd_override",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
yaml_cmd=_yaml_cmd[:80],
|
||||
)
|
||||
except Exception as _yaml_err:
|
||||
logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))
|
||||
|
||||
# Phase 6 ADR-087: 自我降級守衛(AIOPS_P6_SELF_DEMOTION 控制)
|
||||
# SLO 違反 → 全域信心閾值調高;連續違反 → 保守模式,所有自動執行降為人工
|
||||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立
|
||||
|
||||
@@ -77,6 +77,7 @@ groups:
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} 磁碟空間不足"
|
||||
description: "磁碟使用率超過 85%"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
|
||||
|
||||
# =========================================================================
|
||||
# K8s 叢集告警 (kubernetes_alerts)
|
||||
@@ -161,7 +162,10 @@ groups:
|
||||
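# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
# NOTE(review): 本告警的 auto_repair_action 讀取的是 /home/ollama/backup/110/last_success,與上述路徑不同 — 請確認兩者是否為同一標記檔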
|
||||
# node-exporter textfile collector 讀取此檔案暴露指標
|
||||
- alert: HostBackupFailed
|
||||
expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000)
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector label 判斷
|
||||
# 根因:node_textfile_scrape_error 目前已不帶 collector 欄位,此條件在實際環境持續 absent()=true,導致告警永遠成立。
|
||||
# 修法:僅以 backup_110_last_success_timestamp 是否缺失 / 是否超時為主判斷。
|
||||
expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -174,6 +178,7 @@ groups:
|
||||
annotations:
|
||||
summary: "188 Host 備份超過 25 小時未成功"
|
||||
description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
|
||||
|
||||
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
|
||||
- alert: CoreDNSResolutionFailed
|
||||
|
||||
@@ -83,6 +83,7 @@ groups:
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} 磁碟空間不足"
|
||||
description: "磁碟使用率超過 85%"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
|
||||
|
||||
# =========================================================================
|
||||
# K8s 叢集告警 (kubernetes_alerts)
|
||||
@@ -167,7 +168,10 @@ groups:
|
||||
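# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
# NOTE(review): 本告警的 auto_repair_action 讀取的是 /home/ollama/backup/110/last_success,與上述路徑不同 — 請確認兩者是否為同一標記檔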
|
||||
# node-exporter textfile collector 讀取此檔案暴露指標
|
||||
- alert: HostBackupFailed
|
||||
expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000)
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector 標籤條件
|
||||
# 根因:node_textfile_scrape_error 已不帶 collector 標籤,absent(node_textfile_scrape_error{collector="backup_110"}) 恆為 true,導致告警永遠成立(誤報)。
|
||||
# 修法:以 backup_110_last_success_timestamp 是否缺失/超時判斷。
|
||||
expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -180,6 +184,7 @@ groups:
|
||||
annotations:
|
||||
summary: "188 Host 備份超過 25 小時未成功"
|
||||
description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
|
||||
|
||||
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
|
||||
- alert: CoreDNSResolutionFailed
|
||||
|
||||
Reference in New Issue
Block a user