From b371edb70ce290028e6395f4fc46cff77836f4a7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 2 May 2026 18:35:26 +0800 Subject: [PATCH] fix host alert auto-repair routing and backup false positives --- apps/api/alert_rules.yaml | 3 +- apps/api/src/services/decision_manager.py | 129 +++++++++++----------- ops/monitoring/alerts-unified.yml | 7 +- ops/monitoring/alerts.yml | 7 +- 4 files changed, 78 insertions(+), 68 deletions(-) diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 53b97b20..25ccd552 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -668,7 +668,8 @@ rules: action_title: "🔍 備份失敗自動診斷 — SSH 收集備份與磁碟狀態" description: "⚠️ 備份任務失敗。先自動 SSH 收集 backup log、last_success 與磁碟空間;若無法確認安全修復,立即升級緊急介入。" suggested_action: SSH_DIAGNOSE - kubectl_command: "ssh {host} 'echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'" + # 2026-05-02 ogt + Claude Sonnet 4.6: 補上 ps aux 讓 _ssh_execute 走 diagnostics 路徑(無阻擋) + kubectl_command: "ssh {host} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'" estimated_downtime: "N/A" risk: low responsibility: INFRA diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 8d7dd87b..e65b660a 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1751,7 +1751,70 @@ class DecisionManager: 執行後發 Telegram 結果通知 (統帥要求: 修復結果對應同一告警) 2026-04-09 Claude Sonnet 4.6 Asia/Taipei """ - action = token.proposal_data.get("kubectl_command", "") + action = token.proposal_data.get("kubectl_command", "") or token.proposal_data.get("action", "") + _alert_labels = incident.signals[0].labels if incident.signals else {} + + # 2026-05-02 ogt + Claude Sonnet 4.6: YAML 是權威,先覆蓋 LLM 生成的 action + # 根因:LLM/Phase2 會先產出 node-exporter/kubectl 的錯域建議,導致 + # bare_metal 告警每次都被攔回人工,形成循環。 + # 修法:先執行 YAML match_rule 對齊,若 YAML 有可落地 command 則覆蓋。 + _alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else "" + if _alertname_for_yaml: + try: + from src.services.alert_rule_engine import match_rule as _yaml_match + _yaml_r = _yaml_match({ + "labels": _alert_labels, + "alert_type": _alertname_for_yaml, + "message": "", + "target_resource": incident.affected_services[0] if incident.affected_services else "unknown", + "namespace": "awoooi-prod", + }) + if _yaml_r: + if _yaml_r.get("suggested_action") == "NO_ACTION": + logger.info( + "auto_execute_yaml_no_action", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + reason="YAML 規則明確標記 NO_ACTION,不執行自動修復", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}" + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + + # 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4 + _yaml_blocked = _yaml_r.get("blocked_reason", "") + if "INVALID_TARGET" in _yaml_blocked: + logger.warning( + "auto_execute_yaml_invalid_target", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + blocked_reason=_yaml_blocked, + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["blocked_reason"] = _yaml_blocked + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + + _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip() + if _yaml_cmd and not _yaml_cmd.startswith("kubectl"): + # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action + action = _yaml_cmd + token.proposal_data["action"] = action + token.proposal_data["kubectl_command"] = action + await self._save_token(token) + logger.info( + "auto_execute_yaml_cmd_override", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + yaml_cmd=action[:80], + ) + except Exception as _yaml_err: + logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err)) # 2026-05-02 ogt + Claude Sonnet 4.6: bare_metal × kubectl 拒絕守衛 # 根因:HostHighCpuLoad / HostOutOfMemory 等主機層告警 fire 在實體機(如 110, @@ -1760,7 +1823,6 @@ class DecisionManager: # 重啟 awoooi 服務根本解不了第三方 CPU 燒爆,只是拖累自己。 # 修法:偵測到 alert host_type=bare_metal 且 action 是 kubectl 類,立即降級人工, # Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。 - _alert_labels = incident.signals[0].labels if incident.signals else {} _host_type = (_alert_labels.get("host_type") or "").lower() _action_stripped = action.lstrip().lower() if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"): @@ -1789,69 +1851,6 @@ class DecisionManager: _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) return - # 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復 - # 根因:LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節 - # YAML 規則是人工審閱的權威來源,LLM 只是輔助 - # 修復策略: - # 1. YAML → NO_ACTION → 立即返回,不執行任何操作 - # 2. YAML → SSH 指令(非 kubectl)→ 覆蓋 LLM 生成的 action,讓 SSH 路由生效 - _alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else "" - if _alertname_for_yaml: - try: - from src.services.alert_rule_engine import match_rule as _yaml_match - _yaml_r = _yaml_match({ - "labels": incident.signals[0].labels if incident.signals else {}, - "alert_type": _alertname_for_yaml, - "message": "", - "target_resource": incident.affected_services[0] if incident.affected_services else "unknown", - "namespace": "awoooi-prod", - }) - if _yaml_r: - if _yaml_r.get("suggested_action") == "NO_ACTION": - logger.info( - "auto_execute_yaml_no_action", - incident_id=incident.incident_id, - alertname=_alertname_for_yaml, - reason="YAML 規則明確標記 NO_ACTION,不執行自動修復", - ) - token.state = DecisionState.READY - token.proposal_data["auto_executed"] = False - token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}" - await self._save_token(token) - _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) - return - - # 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4 - # 根因:target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason - # 系統不應繼續嘗試執行,提早返回讓 SRE 介入 - _yaml_blocked = _yaml_r.get("blocked_reason", "") - if "INVALID_TARGET" in _yaml_blocked: - logger.warning( - "auto_execute_yaml_invalid_target", - incident_id=incident.incident_id, - alertname=_alertname_for_yaml, - blocked_reason=_yaml_blocked, - ) - token.state = DecisionState.READY - token.proposal_data["auto_executed"] = False - token.proposal_data["blocked_reason"] = _yaml_blocked - await self._save_token(token) - _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) - return - - _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip() - if _yaml_cmd and not _yaml_cmd.startswith("kubectl"): - # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action - action = _yaml_cmd - logger.info( - "auto_execute_yaml_cmd_override", - incident_id=incident.incident_id, - alertname=_alertname_for_yaml, - yaml_cmd=_yaml_cmd[:80], - ) - except Exception as _yaml_err: - logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err)) - # Phase 6 ADR-087: 自我降級守衛(AIOPS_P6_SELF_DEMOTION 控制) # SLO 違反 → 全域信心閾值調高;連續違反 → 保守模式,所有自動執行降為人工 # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index d4e3db3a..ed665e09 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -77,6 +77,7 @@ groups: annotations: summary: "主機 {{ $labels.host }} 磁碟空間不足" description: "磁碟使用率超過 85%" + auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'" # ========================================================================= # K8s 叢集告警 (kubernetes_alerts) @@ -161,7 +162,10 @@ groups: # backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success # node-exporter textfile collector 讀取此檔案暴露指標 - alert: HostBackupFailed - expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000) + # 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector label 判斷 + # 根因:node_textfile_scrape_error 目前已不帶 collector 欄位,此條件在實際環境持續 absent()=true,導致告警永遠成立。 + # 修法:僅以 backup_110_last_success_timestamp 是否缺失 / 是否超時為主判斷。 + expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000) for: 10m labels: severity: warning @@ -174,6 +178,7 @@ groups: annotations: summary: "188 Host 備份超過 25 小時未成功" description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊" + auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'" # ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt) - alert: CoreDNSResolutionFailed diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index d5e39905..da18aff2 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -83,6 +83,7 @@ groups: annotations: summary: "主機 {{ $labels.host }} 磁碟空間不足" description: "磁碟使用率超過 85%" + auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'" # ========================================================================= # K8s 叢集告警 (kubernetes_alerts) @@ -167,7 +168,10 @@ groups: # backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success # node-exporter textfile collector 讀取此檔案暴露指標 - alert: HostBackupFailed - expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000) + # 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector 標籤條件 + # 根因:node_textfile_scrape_error 已移除 collector 欄位,原條件一直判斷為 absent,造成錯誤告警。 + # 修法:以 backup_110_last_success_timestamp 是否缺失/超時判斷。 + expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000) for: 10m labels: severity: warning @@ -180,6 +184,7 @@ groups: annotations: summary: "188 Host 備份超過 25 小時未成功" description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊" + auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'" # ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt) - alert: CoreDNSResolutionFailed