fix(telegram): 接通 classify_notification + 修復 HostBackupFailed 亂送按鈕

三個問題同時修復:

1. classify_notification() 死程式碼接通
   - _push_decision_to_telegram() 現在先呼叫 classify_notification()
   - TYPE-1 (純資訊) → send_info_notification(),無按鈕
   - TYPE-4D (Config Drift) → send_drift_card()
   - 其餘 TYPE-2/3/4 → send_approval_card()(原有按鈕)
   - decision_state + auto_executed 從呼叫端注入 proposal_data

2. alert_rules.yaml 補 host_backup_failed 規則
   - HostBackupFailed / VeleroBackupFailed / VeleroBackupNotRun / BackupJobFailed → NO_ACTION
   - 不再走 generic_fallback → 不再產生 kubectl rollout restart deployment/backup

3. _verify_k8s_deployment_exists() 主機層告警不再保守放行
   - Host*/Docker*/Backup*/Velero*/SSH* 前綴告警 → K8s MCP 不可用時 return False
   - _auto_execute() 收到 NO_ACTION 或空 kubectl_command → 早退,不執行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-11 20:35:48 +08:00
parent 9382814d14
commit e447f97616
2 changed files with 112 additions and 27 deletions

View File

@@ -568,6 +568,32 @@ rules:
optimization: []
reasoning: "[規則匹配] E2E Smoke Test 假告警,僅確認告警鏈路暢通,無實際服務異常。"
# ── 備份失敗 ────────────────────────────────────────────────
# 2026-04-11 Claude Sonnet 4.6: backup 類告警屬主機層,無 K8s deployment 可重啟
# → TYPE-1 純資訊通知,不應出現 [重啟] 按鈕
- id: host_backup_failed
priority: 50
description: 備份任務失敗 (rsync/velero/HostBackupFailed)
match:
alertname:
- HostBackupFailed
- VeleroBackupFailed
- VeleroBackupNotRun
- BackupJobFailed
response:
action_title: "備份失敗,需人工確認"
description: "⚠️ 備份任務失敗,無自動修復動作。請人工確認備份腳本及磁碟空間。"
suggested_action: NO_ACTION
kubectl_command: ""
estimated_downtime: "N/A"
risk: medium
responsibility: INFRA
responsibility_reasoning: "備份失敗屬基礎設施維運問題,需人工介入確認根因"
secondary_teams: []
optimization: []
reasoning: "[規則匹配] 備份失敗無法自動修復,需人工排查備份腳本、磁碟空間及網路連通性。"
# ── 通用兜底 ────────────────────────────────────────────────
- id: generic_fallback

View File

@@ -112,7 +112,9 @@ async def _push_decision_to_telegram(
# 延遲導入避免循環依賴
from src.core.redis_client import get_redis
from src.services.telegram_gateway import (
classify_notification,
get_telegram_gateway,
NotificationType,
)
# 🔴 去重檢查:同一個 incident 10 分鐘內只發一次
@@ -212,29 +214,59 @@ async def _push_decision_to_telegram(
# 2026-03-27 ogt: 修復 INC-INC-INC- 重複前綴 bug
approval_id = incident.incident_id # 已經是 INC-xxx 格式
tg_result = await gateway.send_approval_card(
approval_id=approval_id,
risk_level=risk_level,
resource_name=target[:50],
root_cause=reasoning[:150] if reasoning else description[:150], # 2026-04-03 ogt: 移除 [LLM_xxx] prefix擴大至 150 字
suggested_action=action[:80] if action else "待分析", # 2026-04-03 ogt: 50→80 字
estimated_downtime="5-15 min",
primary_responsibility="INFRA",
# ADR-071: 通知分類器 — 依告警類型/狀態決定卡片種類
# 2026-04-11 Claude Sonnet 4.6: 接通 classify_notification(),原本死程式碼
_auto_executed = proposal_data.get("auto_executed", False)
_decision_state = proposal_data.get("decision_state", "")
_mcp_all_failed = proposal_data.get("mcp_all_failed", False)
_notif_type = classify_notification(
incident=incident,
confidence=confidence,
namespace=incident.signals[0].labels.get("namespace", "default") if incident.signals else "default",
ai_provider=ai_provider, # 2026-03-29 ogt: 顯示 AI 模型來源
ai_model=ai_model, # 2026-04-04 ogt: 底層模型名稱
# 2026-04-02 ogt: Phase 22 Nemotron 協作 (ADR-044)
nemotron_enabled=nemotron_enabled,
nemotron_tools=nemotron_tools,
nemotron_validation=nemotron_validation,
nemotron_latency_ms=nemotron_latency_ms,
nemotron_tool_model=nemotron_tool_model,
nemotron_tool_backend=nemotron_tool_backend,
# 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕
incident_id=incident.incident_id,
auto_executed=_auto_executed,
mcp_all_failed=_mcp_all_failed,
decision_state=_decision_state,
)
if _notif_type == NotificationType.TYPE_1:
# 純資訊通知 — 無按鈕
tg_result = await gateway.send_info_notification(
incident_id=incident.incident_id,
title=incident.title or "告警通知",
message=reasoning[:200] if reasoning else description[:200],
alertname=incident.signals[0].labels.get("alertname", "") if incident.signals else "",
severity="info",
)
elif _notif_type == NotificationType.TYPE_4_DRIFT:
# Config Drift 專屬卡片
tg_result = await gateway.send_drift_card(
incident_id=incident.incident_id,
approval_id=approval_id,
resource_name=target[:50],
diff_summary=description[:500],
)
else:
# TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定)
tg_result = await gateway.send_approval_card(
approval_id=approval_id,
risk_level=risk_level,
resource_name=target[:50],
root_cause=reasoning[:150] if reasoning else description[:150],
suggested_action=action[:80] if action else "待分析",
estimated_downtime="5-15 min",
primary_responsibility="INFRA",
confidence=confidence,
namespace=incident.signals[0].labels.get("namespace", "default") if incident.signals else "default",
ai_provider=ai_provider,
ai_model=ai_model,
nemotron_enabled=nemotron_enabled,
nemotron_tools=nemotron_tools,
nemotron_validation=nemotron_validation,
nemotron_latency_ms=nemotron_latency_ms,
nemotron_tool_model=nemotron_tool_model,
nemotron_tool_backend=nemotron_tool_backend,
incident_id=incident.incident_id,
)
# 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續
# 同時寫 Redis (快速查詢) 和 DB (持久化,不受 TTL 限制)
tg_message_id = tg_result.get("result", {}).get("message_id") if isinstance(tg_result, dict) else None
@@ -513,20 +545,27 @@ async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str
return None
async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool:
async def _verify_k8s_deployment_exists(target: str, namespace: str, alertname: str = "") -> bool:
"""
BUG-003 補救:呼叫 K8s MCP 確認 deployment/pod 是否真實存在。
K8s MCP 不可用時 → 返回 True(不阻塞,保守策略)
K8s MCP 不可用時:
- 主機層告警 (Host*/Docker*/Backup*/Velero*/SSH*) → 返回 False(阻止 K8s 操作)
- K8s 層告警 → 返回 True(保守放行,讓 kubectl 自行報錯)
2026-04-11 Claude Sonnet 4.6 Asia/Taipei
"""
# 主機層告警前綴 — 這類告警的修復目標是 Docker container/主機,不是 K8s deployment
_HOST_ALERTNAME_PREFIXES = ("Host", "Docker", "Backup", "Velero", "SSH")
_is_host_alert = alertname.startswith(_HOST_ALERTNAME_PREFIXES) if alertname else False
try:
from src.plugins.mcp.providers.k8s_provider import K8sProvider
k8s = K8sProvider()
if not k8s.enabled:
# MCP 不可用 → 保守放行,讓 kubectl 自行報錯
return True
# 主機層告警:K8s MCP 不可用 → 拒絕(不應對主機層問題執行 K8s 操作)
# K8s 層告警:保守放行,讓 kubectl 自行報錯
return not _is_host_alert
result = await k8s.execute(
tool_name="kubectl_get",
@@ -544,8 +583,8 @@ async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool:
except Exception as e:
logger.debug("verify_k8s_deployment_exists_failed", target=target, error=str(e))
# 例外時保守放行
return True
# 例外時:主機層告警拒絕,其他保守放行
return not _is_host_alert
async def _fetch_metrics_snapshot(incident: Incident) -> dict:
@@ -1097,6 +1136,9 @@ class DecisionManager:
)
else:
# 需人工審核: 推送到 Telegram
# ADR-071: 注入 decision_state + auto_executed 供 classify_notification 使用
token.proposal_data["decision_state"] = token.state.value if token.state else ""
token.proposal_data["auto_executed"] = False
_fire_and_forget(
_push_decision_to_telegram(incident, token.proposal_data)
)
@@ -1113,6 +1155,23 @@ class DecisionManager:
"""
action = token.proposal_data.get("kubectl_command", "")
# NO_ACTION 規則(備份失敗/E2E smoke test 等)— kubectl_command 為空,不執行,直接返回
# 2026-04-11 Claude Sonnet 4.6: 防止空 action 或 NO_ACTION 字串進入自動執行流程
_suggested_action = token.proposal_data.get("suggested_action", "")
if not action or _suggested_action == "NO_ACTION":
logger.info(
"auto_execute_skipped_no_action",
incident_id=incident.incident_id,
suggested_action=_suggested_action,
reason="規則標記 NO_ACTION 或 kubectl_command 為空,不執行自動修復",
)
token.state = DecisionState.READY
await self._save_token(token)
_fire_and_forget(
_push_decision_to_telegram(incident, token.proposal_data)
)
return
# 替換所有 placeholder — {target}/{namespace}/<deployment_name> 等
_target = incident.affected_services[0] if incident.affected_services else "unknown"
_ns = "awoooi-prod"
@@ -1158,7 +1217,7 @@ class DecisionManager:
# 避免 LLM 產生的無效 deployment name(<placeholder>/alertname/unknown)通過 safety guard
# 但仍對 K8s 發出錯誤指令
if _target and _target != "unknown":
_k8s_verified = await _verify_k8s_deployment_exists(_target, _ns)
_k8s_verified = await _verify_k8s_deployment_exists(_target, _ns, alertname=_alertname)
if not _k8s_verified:
logger.warning(
"auto_execute_blocked_deployment_not_found",