diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index c9a4e715..2095e495 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -568,6 +568,32 @@ rules: optimization: [] reasoning: "[規則匹配] E2E Smoke Test 假告警,僅確認告警鏈路暢通,無實際服務異常。" + # ── 備份失敗 ──────────────────────────────────────────────── + # 2026-04-11 Claude Sonnet 4.6: backup 類告警屬主機層,無 K8s deployment 可重啟 + # → TYPE-1 純資訊通知,不應出現 [重啟] 按鈕 + + - id: host_backup_failed + priority: 50 + description: 備份任務失敗 (rsync/velero/HostBackupFailed) + match: + alertname: + - HostBackupFailed + - VeleroBackupFailed + - VeleroBackupNotRun + - BackupJobFailed + response: + action_title: "備份失敗,需人工確認" + description: "⚠️ 備份任務失敗,無自動修復動作。請人工確認備份腳本及磁碟空間。" + suggested_action: NO_ACTION + kubectl_command: "" + estimated_downtime: "N/A" + risk: medium + responsibility: INFRA + responsibility_reasoning: "備份失敗屬基礎設施維運問題,需人工介入確認根因" + secondary_teams: [] + optimization: [] + reasoning: "[規則匹配] 備份失敗無法自動修復,需人工排查備份腳本、磁碟空間及網路連通性。" + # ── 通用兜底 ──────────────────────────────────────────────── - id: generic_fallback diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 7201c653..d5eeb5b8 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -112,7 +112,9 @@ async def _push_decision_to_telegram( # 延遲導入避免循環依賴 from src.core.redis_client import get_redis from src.services.telegram_gateway import ( + classify_notification, get_telegram_gateway, + NotificationType, ) # 🔴 去重檢查:同一個 incident 10 分鐘內只發一次 @@ -212,29 +214,59 @@ async def _push_decision_to_telegram( # 2026-03-27 ogt: 修復 INC-INC-INC- 重複前綴 bug approval_id = incident.incident_id # 已經是 INC-xxx 格式 - tg_result = await gateway.send_approval_card( - approval_id=approval_id, - risk_level=risk_level, - resource_name=target[:50], - root_cause=reasoning[:150] if reasoning else description[:150], # 2026-04-03 ogt: 移除 [LLM_xxx] prefix,擴大至 150 字 - suggested_action=action[:80] if action else "待分析", # 2026-04-03 ogt: 50→80 字 - estimated_downtime="5-15 min", - primary_responsibility="INFRA", + # ADR-071: 通知分類器 — 依告警類型/狀態決定卡片種類 + # 2026-04-11 Claude Sonnet 4.6: 接通 classify_notification(),原本死程式碼 + _auto_executed = proposal_data.get("auto_executed", False) + _decision_state = proposal_data.get("decision_state", "") + _mcp_all_failed = proposal_data.get("mcp_all_failed", False) + _notif_type = classify_notification( + incident=incident, confidence=confidence, - namespace=incident.signals[0].labels.get("namespace", "default") if incident.signals else "default", - ai_provider=ai_provider, # 2026-03-29 ogt: 顯示 AI 模型來源 - ai_model=ai_model, # 2026-04-04 ogt: 底層模型名稱 - # 2026-04-02 ogt: Phase 22 Nemotron 協作 (ADR-044) - nemotron_enabled=nemotron_enabled, - nemotron_tools=nemotron_tools, - nemotron_validation=nemotron_validation, - nemotron_latency_ms=nemotron_latency_ms, - nemotron_tool_model=nemotron_tool_model, - nemotron_tool_backend=nemotron_tool_backend, - # 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕 - incident_id=incident.incident_id, + auto_executed=_auto_executed, + mcp_all_failed=_mcp_all_failed, + decision_state=_decision_state, ) + if _notif_type == NotificationType.TYPE_1: + # 純資訊通知 — 無按鈕 + tg_result = await gateway.send_info_notification( + incident_id=incident.incident_id, + title=incident.title or "告警通知", + message=reasoning[:200] if reasoning else description[:200], + alertname=incident.signals[0].labels.get("alertname", "") if incident.signals else "", + severity="info", + ) + elif _notif_type == NotificationType.TYPE_4_DRIFT: + # Config Drift 專屬卡片 + tg_result = await gateway.send_drift_card( + incident_id=incident.incident_id, + approval_id=approval_id, + resource_name=target[:50], + diff_summary=description[:500], + ) + else: + # TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定) + tg_result = await gateway.send_approval_card( + approval_id=approval_id, + risk_level=risk_level, + resource_name=target[:50], + root_cause=reasoning[:150] if reasoning else description[:150], + suggested_action=action[:80] if action else "待分析", + estimated_downtime="5-15 min", + primary_responsibility="INFRA", + confidence=confidence, + namespace=incident.signals[0].labels.get("namespace", "default") if incident.signals else "default", + ai_provider=ai_provider, + ai_model=ai_model, + nemotron_enabled=nemotron_enabled, + nemotron_tools=nemotron_tools, + nemotron_validation=nemotron_validation, + nemotron_latency_ms=nemotron_latency_ms, + nemotron_tool_model=nemotron_tool_model, + nemotron_tool_backend=nemotron_tool_backend, + incident_id=incident.incident_id, + ) + # 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續 # 同時寫 Redis (快速查詢) 和 DB (持久化,不受 TTL 限制) tg_message_id = tg_result.get("result", {}).get("message_id") if isinstance(tg_result, dict) else None @@ -513,20 +545,27 @@ async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str return None -async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool: +async def _verify_k8s_deployment_exists(target: str, namespace: str, alertname: str = "") -> bool: """ BUG-003 補救:呼叫 K8s MCP 確認 deployment/pod 是否真實存在。 - K8s MCP 不可用時 → 返回 True(不阻塞,保守策略)。 + K8s MCP 不可用時: + - 主機層告警 (Host*/Docker*) → 返回 False(阻止 K8s 操作) + - K8s 層告警 → 返回 True(保守放行,讓 kubectl 自行報錯) 2026-04-11 Claude Sonnet 4.6 Asia/Taipei """ + # 主機層告警前綴 — 這類告警的修復目標是 Docker container/主機,不是 K8s deployment + _HOST_ALERTNAME_PREFIXES = ("Host", "Docker", "Backup", "Velero", "SSH") + _is_host_alert = alertname.startswith(_HOST_ALERTNAME_PREFIXES) if alertname else False + try: from src.plugins.mcp.providers.k8s_provider import K8sProvider k8s = K8sProvider() if not k8s.enabled: - # MCP 不可用 → 保守放行,讓 kubectl 自行報錯 - return True + # 主機層告警:K8s MCP 不可用 → 拒絕(不應對主機層問題執行 K8s 操作) + # K8s 層告警:保守放行,讓 kubectl 自行報錯 + return not _is_host_alert result = await k8s.execute( tool_name="kubectl_get", @@ -544,8 +583,8 @@ async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool: except Exception as e: logger.debug("verify_k8s_deployment_exists_failed", target=target, error=str(e)) - # 例外時保守放行 - return True + # 例外時:主機層告警拒絕,其他保守放行 + return not _is_host_alert async def _fetch_metrics_snapshot(incident: Incident) -> dict: @@ -1097,6 +1136,9 @@ class DecisionManager: ) else: # 需人工審核: 推送到 Telegram + # ADR-071: 注入 decision_state + auto_executed 供 classify_notification 使用 + token.proposal_data["decision_state"] = token.state.value if token.state else "" + token.proposal_data["auto_executed"] = False _fire_and_forget( _push_decision_to_telegram(incident, token.proposal_data) ) @@ -1113,6 +1155,23 @@ class DecisionManager: """ action = token.proposal_data.get("kubectl_command", "") + # NO_ACTION 規則(備份失敗/E2E smoke test 等)— kubectl_command 為空,不執行,直接返回 + # 2026-04-11 Claude Sonnet 4.6: 防止空 action 或 NO_ACTION 字串進入自動執行流程 + _suggested_action = token.proposal_data.get("suggested_action", "") + if not action or _suggested_action == "NO_ACTION": + logger.info( + "auto_execute_skipped_no_action", + incident_id=incident.incident_id, + suggested_action=_suggested_action, + reason="規則標記 NO_ACTION 或 kubectl_command 為空,不執行自動修復", + ) + token.state = DecisionState.READY + await self._save_token(token) + _fire_and_forget( + _push_decision_to_telegram(incident, token.proposal_data) + ) + return + # 替換所有 placeholder — {target}/{namespace}/ 等 _target = incident.affected_services[0] if incident.affected_services else "unknown" _ns = "awoooi-prod" @@ -1158,7 +1217,7 @@ class DecisionManager: # 避免 LLM 產生的無效 deployment name(/alertname/unknown)通過 safety guard # 但仍對 K8s 發出錯誤指令 if _target and _target != "unknown": - _k8s_verified = await _verify_k8s_deployment_exists(_target, _ns) + _k8s_verified = await _verify_k8s_deployment_exists(_target, _ns, alertname=_alertname) if not _k8s_verified: logger.warning( "auto_execute_blocked_deployment_not_found",