fix host alert auto-repair routing and backup false positives

2026-05-02 18:35:26 +08:00
parent 68e182381f
commit b371edb70c
4 changed files with 78 additions and 68 deletions
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -668,7 +668,8 @@ rules:
      action_title: "🔍 備份失敗自動診斷 — SSH 收集備份與磁碟狀態"
      description: "⚠️ 備份任務失敗。先自動 SSH 收集 backup log、last_success 與磁碟空間；若無法確認安全修復，立即升級緊急介入。"
      suggested_action: SSH_DIAGNOSE
-      kubectl_command: "ssh {host} 'echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
+      # 2026-05-02 ogt + Claude Sonnet 4.6: 補上 ps aux 讓 _ssh_execute 走 diagnostics 路徑（無阻擋）
+      kubectl_command: "ssh {host} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
      estimated_downtime: "N/A"
      risk: low
      responsibility: INFRA
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -1751,7 +1751,70 @@ class DecisionManager:
        執行後發 Telegram 結果通知 (統帥要求: 修復結果對應同一告警)
        2026-04-09 Claude Sonnet 4.6 Asia/Taipei
        """
-        action = token.proposal_data.get("kubectl_command", "")
+        action = token.proposal_data.get("kubectl_command", "") or token.proposal_data.get("action", "")
+        _alert_labels = incident.signals[0].labels if incident.signals else {}
+
+        # 2026-05-02 ogt + Claude Sonnet 4.6: YAML 是權威，先覆蓋 LLM 生成的 action
+        # 根因：LLM/Phase2 會先產出 node-exporter/kubectl 的錯域建議，導致
+        #       bare_metal 告警每次都被攔回人工，形成循環。
+        # 修法：先執行 YAML match_rule 對齊；NO_ACTION / INVALID_TARGET 時不自動執行並直接返回，僅當 YAML 給出非 kubectl（SSH / docker）指令時才覆蓋 action。
+        _alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
+        if _alertname_for_yaml:
+            try:
+                from src.services.alert_rule_engine import match_rule as _yaml_match
+                _yaml_r = _yaml_match({
+                    "labels": _alert_labels,
+                    "alert_type": _alertname_for_yaml,
+                    "message": "",
+                    "target_resource": incident.affected_services[0] if incident.affected_services else "unknown",
+                    "namespace": "awoooi-prod",
+                })
+                if _yaml_r:
+                    if _yaml_r.get("suggested_action") == "NO_ACTION":
+                        logger.info(
+                            "auto_execute_yaml_no_action",
+                            incident_id=incident.incident_id,
+                            alertname=_alertname_for_yaml,
+                            reason="YAML 規則明確標記 NO_ACTION，不執行自動修復",
+                        )
+                        token.state = DecisionState.READY
+                        token.proposal_data["auto_executed"] = False
+                        token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}"
+                        await self._save_token(token)
+                        _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
+                        return
+
+                    # 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4（根因：target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason；系統不應繼續嘗試執行，提早返回讓 SRE 介入）
+                    _yaml_blocked = _yaml_r.get("blocked_reason", "")
+                    if "INVALID_TARGET" in _yaml_blocked:
+                        logger.warning(
+                            "auto_execute_yaml_invalid_target",
+                            incident_id=incident.incident_id,
+                            alertname=_alertname_for_yaml,
+                            blocked_reason=_yaml_blocked,
+                        )
+                        token.state = DecisionState.READY
+                        token.proposal_data["auto_executed"] = False
+                        token.proposal_data["blocked_reason"] = _yaml_blocked
+                        await self._save_token(token)
+                        _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
+                        return
+
+                    _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
+                    if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
+                        # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action
+                        action = _yaml_cmd
+                        token.proposal_data["action"] = action
+                        token.proposal_data["kubectl_command"] = action
+                        await self._save_token(token)
+                        logger.info(
+                            "auto_execute_yaml_cmd_override",
+                            incident_id=incident.incident_id,
+                            alertname=_alertname_for_yaml,
+                            yaml_cmd=action[:80],
+                        )
+            except Exception as _yaml_err:
+                logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))

        # 2026-05-02 ogt + Claude Sonnet 4.6: bare_metal × kubectl 拒絕守衛
        # 根因：HostHighCpuLoad / HostOutOfMemory 等主機層告警 fire 在實體機（如 110，
@@ -1760,7 +1823,6 @@ class DecisionManager:
        #       重啟 awoooi 服務根本解不了第三方 CPU 燒爆，只是拖累自己。
        # 修法：偵測到 alert host_type=bare_metal 且 action 是 kubectl 類，立即降級人工，
        #       Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。
-        _alert_labels = incident.signals[0].labels if incident.signals else {}
        _host_type = (_alert_labels.get("host_type") or "").lower()
        _action_stripped = action.lstrip().lower()
        if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"):
@@ -1789,69 +1851,6 @@ class DecisionManager:
            _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
            return

-        # 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復
-        # 根因：LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節
-        # YAML 規則是人工審閱的權威來源，LLM 只是輔助
-        # 修復策略：
-        #   1. YAML → NO_ACTION → 立即返回，不執行任何操作
-        #   2. YAML → SSH 指令（非 kubectl）→ 覆蓋 LLM 生成的 action，讓 SSH 路由生效
-        _alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
-        if _alertname_for_yaml:
-            try:
-                from src.services.alert_rule_engine import match_rule as _yaml_match
-                _yaml_r = _yaml_match({
-                    "labels": incident.signals[0].labels if incident.signals else {},
-                    "alert_type": _alertname_for_yaml,
-                    "message": "",
-                    "target_resource": incident.affected_services[0] if incident.affected_services else "unknown",
-                    "namespace": "awoooi-prod",
-                })
-                if _yaml_r:
-                    if _yaml_r.get("suggested_action") == "NO_ACTION":
-                        logger.info(
-                            "auto_execute_yaml_no_action",
-                            incident_id=incident.incident_id,
-                            alertname=_alertname_for_yaml,
-                            reason="YAML 規則明確標記 NO_ACTION，不執行自動修復",
-                        )
-                        token.state = DecisionState.READY
-                        token.proposal_data["auto_executed"] = False
-                        token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}"
-                        await self._save_token(token)
-                        _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
-                        return
-
-                    # 2026-04-16 ogt + Claude Sonnet 4.6: INVALID_TARGET → 人工確認 TYPE-4
-                    # 根因：target 無法解析 → rule engine 清空 kubectl_command + 設 blocked_reason
-                    # 系統不應繼續嘗試執行，提早返回讓 SRE 介入
-                    _yaml_blocked = _yaml_r.get("blocked_reason", "")
-                    if "INVALID_TARGET" in _yaml_blocked:
-                        logger.warning(
-                            "auto_execute_yaml_invalid_target",
-                            incident_id=incident.incident_id,
-                            alertname=_alertname_for_yaml,
-                            blocked_reason=_yaml_blocked,
-                        )
-                        token.state = DecisionState.READY
-                        token.proposal_data["auto_executed"] = False
-                        token.proposal_data["blocked_reason"] = _yaml_blocked
-                        await self._save_token(token)
-                        _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
-                        return
-
-                    _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
-                    if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
-                        # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action
-                        action = _yaml_cmd
-                        logger.info(
-                            "auto_execute_yaml_cmd_override",
-                            incident_id=incident.incident_id,
-                            alertname=_alertname_for_yaml,
-                            yaml_cmd=_yaml_cmd[:80],
-                        )
-            except Exception as _yaml_err:
-                logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))
-
        # Phase 6 ADR-087: 自我降級守衛（AIOPS_P6_SELF_DEMOTION 控制）
        # SLO 違反 → 全域信心閾值調高；連續違反 → 保守模式，所有自動執行降為人工
        # 2026-04-15 ogt + Claude Sonnet 4.6（亞太）: Phase 6 初始建立
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -77,6 +77,7 @@ groups:
        annotations:
          summary: "主機 {{ $labels.host }} 磁碟空間不足"
          description: "磁碟使用率超過 85%"
+          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"

  # =========================================================================
  # K8s 叢集告警 (kubernetes_alerts)
@@ -161,7 +162,10 @@ groups:
      # backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
      # node-exporter textfile collector 讀取此檔案暴露指標
      - alert: HostBackupFailed
-        expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000)
+        # 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector label 判斷
+        # 根因：node_textfile_scrape_error 目前已不帶 collector 欄位，此條件在實際環境持續 absent()=true，導致告警永遠成立。
+        # 修法：僅以 backup_110_last_success_timestamp 是否缺失 / 是否超時為主判斷。
+        expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
        for: 10m
        labels:
          severity: warning
@@ -174,6 +178,7 @@ groups:
        annotations:
          summary: "188 Host 備份超過 25 小時未成功"
          description: "backup-from-110.sh 可能失敗，/backup/110 資料可能過舊"
+          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"

      # ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
      - alert: CoreDNSResolutionFailed
--- a/ops/monitoring/alerts.yml
+++ b/ops/monitoring/alerts.yml
@@ -83,6 +83,7 @@ groups:
        annotations:
          summary: "主機 {{ $labels.host }} 磁碟空間不足"
          description: "磁碟使用率超過 85%"
+          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"

  # =========================================================================
  # K8s 叢集告警 (kubernetes_alerts)
@@ -167,7 +168,10 @@ groups:
      # backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
      # node-exporter textfile collector 讀取此檔案暴露指標
      - alert: HostBackupFailed
-        expr: time() - node_textfile_scrape_error{collector="backup_110"} == 0 or absent(node_textfile_scrape_error{collector="backup_110"}) or (time() - backup_110_last_success_timestamp > 90000)
+        # 2026-05-02 ogt + Claude Sonnet 4.6: 刪除 backup_110 collector 標籤條件
+        # 根因：node_textfile_scrape_error 已移除 collector 欄位，原條件一直判斷為 absent，造成錯誤告警。
+        # 修法：以 backup_110_last_success_timestamp 是否缺失/超時判斷。
+        expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
        for: 10m
        labels:
          severity: warning
@@ -180,6 +184,7 @@ groups:
        annotations:
          summary: "188 Host 備份超過 25 小時未成功"
          description: "backup-from-110.sh 可能失敗，/backup/110 資料可能過舊"
+          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"

      # ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
      - alert: CoreDNSResolutionFailed