fix host alert auto-repair routing and backup false positives

This commit is contained in:
Your Name
2026-05-02 18:35:26 +08:00
parent 68e182381f
commit b371edb70c
4 changed files with 78 additions and 68 deletions

View File

@@ -668,7 +668,8 @@ rules:
action_title: "🔍 備份失敗自動診斷 — SSH 收集備份與磁碟狀態"
description: "⚠️ 備份任務失敗。先自動 SSH 收集 backup log、last_success 與磁碟空間;若無法確認安全修復,立即升級緊急介入。"
suggested_action: SSH_DIAGNOSE
kubectl_command: "ssh {host} 'echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
# 2026-05-02 ogt + Claude Sonnet 4.6: 補上 ps aux 讓 _ssh_execute 走 diagnostics 路徑(無阻擋)
kubectl_command: "ssh {host} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
estimated_downtime: "N/A"
risk: low
responsibility: INFRA

View File

@@ -1751,7 +1751,70 @@ class DecisionManager:
執行後發 Telegram 結果通知 (統帥要求: 修復結果對應同一告警)
2026-04-09 Claude Sonnet 4.6 Asia/Taipei
"""
action = token.proposal_data.get("kubectl_command", "")
action = token.proposal_data.get("kubectl_command", "") or token.proposal_data.get("action", "")
_alert_labels = incident.signals[0].labels if incident.signals else {}
# 2026-05-02 ogt + Claude Sonnet 4.6: YAML 是權威,先覆蓋 LLM 生成的 action
# 根因:LLM/Phase2 會先產出 node-exporter/kubectl 的錯域建議,導致
# bare_metal 告警每次都被攔回人工,形成循環。
# 修法:先執行 YAML match_rule 對齊,若 YAML 有可落地 command 則覆蓋。
_alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
if _alertname_for_yaml:
try:
from src.services.alert_rule_engine import match_rule as _yaml_match
_yaml_r = _yaml_match({
"labels": _alert_labels,
"alert_type": _alertname_for_yaml,
"message": "",
"target_resource": incident.affected_services[0] if incident.affected_services else "unknown",
"namespace": "awoooi-prod",
})
if _yaml_r:
if _yaml_r.get("suggested_action") == "NO_ACTION":
logger.info(
"auto_execute_yaml_no_action",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
reason="YAML 規則明確標記 NO_ACTION不執行自動修復",
)
token.state = DecisionState.READY
token.proposal_data["auto_executed"] = False
token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}"
await self._save_token(token)
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
# 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
_yaml_blocked = _yaml_r.get("blocked_reason", "")
if "INVALID_TARGET" in _yaml_blocked:
logger.warning(
"auto_execute_yaml_invalid_target",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
blocked_reason=_yaml_blocked,
)
token.state = DecisionState.READY
token.proposal_data["auto_executed"] = False
token.proposal_data["blocked_reason"] = _yaml_blocked
await self._save_token(token)
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action
action = _yaml_cmd
token.proposal_data["action"] = action
token.proposal_data["kubectl_command"] = action
await self._save_token(token)
logger.info(
"auto_execute_yaml_cmd_override",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
yaml_cmd=action[:80],
)
except Exception as _yaml_err:
logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))
# 2026-05-02 ogt + Claude Sonnet 4.6: bare_metal × kubectl 拒絕守衛
# 根因:HostHighCpuLoad / HostOutOfMemory 等主機層告警 fire 在實體機(如 110)
@@ -1760,7 +1823,6 @@ class DecisionManager:
# 重啟 awoooi 服務根本解不了第三方 CPU 燒爆,只是拖累自己。
# 修法:偵測到 alert host_type=bare_metal 且 action 是 kubectl 類,立即降級人工,
# Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。
_alert_labels = incident.signals[0].labels if incident.signals else {}
_host_type = (_alert_labels.get("host_type") or "").lower()
_action_stripped = action.lstrip().lower()
if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"):
@@ -1789,69 +1851,6 @@ class DecisionManager:
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
# 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復
# 根因:LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節
# YAML 規則是人工審閱的權威來源,LLM 只是輔助
# 修復策略:
# 1. YAML → NO_ACTION → 立即返回,不執行任何操作
# 2. YAML → SSH 指令(非 kubectl)→ 覆蓋 LLM 生成的 action,讓 SSH 路由生效
_alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
if _alertname_for_yaml:
try:
from src.services.alert_rule_engine import match_rule as _yaml_match
_yaml_r = _yaml_match({
"labels": incident.signals[0].labels if incident.signals else {},
"alert_type": _alertname_for_yaml,
"message": "",
"target_resource": incident.affected_services[0] if incident.affected_services else "unknown",
"namespace": "awoooi-prod",
})
if _yaml_r:
if _yaml_r.get("suggested_action") == "NO_ACTION":
logger.info(
"auto_execute_yaml_no_action",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
reason="YAML 規則明確標記 NO_ACTION不執行自動修復",
)
token.state = DecisionState.READY
token.proposal_data["auto_executed"] = False
token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}"
await self._save_token(token)
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
# 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
# 根因:target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason
# 系統不應繼續嘗試執行,提早返回讓 SRE 介入
_yaml_blocked = _yaml_r.get("blocked_reason", "")
if "INVALID_TARGET" in _yaml_blocked:
logger.warning(
"auto_execute_yaml_invalid_target",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
blocked_reason=_yaml_blocked,
)
token.state = DecisionState.READY
token.proposal_data["auto_executed"] = False
token.proposal_data["blocked_reason"] = _yaml_blocked
await self._save_token(token)
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
return
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action
action = _yaml_cmd
logger.info(
"auto_execute_yaml_cmd_override",
incident_id=incident.incident_id,
alertname=_alertname_for_yaml,
yaml_cmd=_yaml_cmd[:80],
)
except Exception as _yaml_err:
logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))
# Phase 6 ADR-087: 自我降級守衛(AIOPS_P6_SELF_DEMOTION 控制)
# SLO 違反 → 全域信心閾值調高;連續違反 → 保守模式,所有自動執行降為人工
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立

View File

@@ -77,6 +77,7 @@ groups:
annotations:
summary: "主機 {{ $labels.host }} 磁碟空間不足"
description: "磁碟使用率超過 85%"
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
# =========================================================================
# K8s 叢集告警 (kubernetes_alerts)
@@ -161,7 +162,10 @@ groups:
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
# node-exporter textfile collector 讀取此檔案暴露指標
- alert: HostBackupFailed
expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000)
# 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector label 判斷
# 根因:node_textfile_scrape_error 目前已不帶 collector 欄位,此條件在實際環境持續 absent()=true,導致告警永遠成立。
# 修法:僅以 backup_110_last_success_timestamp 是否缺失 / 是否超時為主判斷。
expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
for: 10m
labels:
severity: warning
@@ -174,6 +178,7 @@ groups:
annotations:
summary: "188 Host 備份超過 25 小時未成功"
description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
- alert: CoreDNSResolutionFailed

View File

@@ -83,6 +83,7 @@ groups:
annotations:
summary: "主機 {{ $labels.host }} 磁碟空間不足"
description: "磁碟使用率超過 85%"
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
# =========================================================================
# K8s 叢集告警 (kubernetes_alerts)
@@ -167,7 +168,10 @@ groups:
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
# node-exporter textfile collector 讀取此檔案暴露指標
- alert: HostBackupFailed
expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000)
# 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector 標籤條件
# 根因:node_textfile_scrape_error 已移除 collector 欄位,原條件一直判斷為 absent,造成錯誤告警。
# 修法:以 backup_110_last_success_timestamp 是否缺失/超時判斷。
expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
for: 10m
labels:
severity: warning
@@ -180,6 +184,7 @@ groups:
annotations:
summary: "188 Host 備份超過 25 小時未成功"
description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
- alert: CoreDNSResolutionFailed