diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml
index c0f09a97..8f8d0ba0 100644
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -510,6 +510,44 @@ rules:
command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/reset -X POST"
reasoning: "[規則匹配] AI Router 已自動降級至備援 Provider,監控熔斷器恢復狀態即可。"
+ # ── E2E / smoke-test alerts ──────────────────────────────────
+ # 2026-04-09 Claude Sonnet 4.6: recognise synthetic E2E test alerts; they are recorded only, never remediated
+ # NOTE(review): this rule only classifies the alert (NO_ACTION, empty kubectl_command); whether downstream auto-execution skips it is not visible here — confirm
+ - id: e2e_smoke_test
+ priority: 120
+ description: E2E Smoke Test / 告警鏈路驗證假告警
+ match:
+ alertname:  # compared with an exact `alertname in list` test by the rule engine (case-sensitive)
+ - E2E_SMOKE_TEST
+ - E2E_FINAL_SMOKE_TEST
+ - SmokeTest
+ instance_prefix:  # case-insensitive startswith() on the alert instance; a single hit matches the rule on its own
+ - e2e-final-
+ - e2e-test-
+ - test-host  # NOTE(review): no trailing "-", so every instance whose name starts with "test-host" matches — confirm intended
+ - smoke-test-
+ message:  # NOTE(review): message matching semantics are not visible here; if substring-based, "smoke test" / "please ignore" are broad — confirm they cannot hide real alerts
+ - e2e smoke test
+ - smoke test
+ - please ignore
+ - e2e test
+ - e2e-final
+ - e2e-test
+ - e2e_smoke
+ - alert chain smoke
+ response:
+ action_title: "告警鏈路驗證成功 (E2E)"
+ description: "✅ E2E Smoke Test 告警已收到,告警鏈路正常。此告警僅用於驗證,無需修復動作。"
+ suggested_action: NO_ACTION  # test alert: nothing to repair
+ kubectl_command: ""  # intentionally empty — no command accompanies NO_ACTION
+ estimated_downtime: "N/A"
+ risk: low
+ responsibility: INFRA
+ responsibility_reasoning: "E2E smoke test 假告警,告警鏈路驗證用途,系統自動識別跳過修復"
+ secondary_teams: []
+ optimization: []
+ reasoning: "[規則匹配] E2E Smoke Test 假告警,僅確認告警鏈路暢通,無實際服務異常。"
+
# ── 通用兜底 ────────────────────────────────────────────────
- id: generic_fallback
diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py
index 822f48b8..8df2f64f 100644
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -101,7 +101,7 @@ def _load_rules() -> list[dict]:
# ── 匹配邏輯 ────────────────────────────────────────────────
-def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
+def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance: str = "") -> bool:
"""判斷規則是否匹配。通用兜底規則(alertname=["*"])永遠回傳 False,由 match_rule 單獨處理。"""
match = rule.get("match", {})
@@ -110,6 +110,13 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
if alertnames == ["*"]:
return False
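+ # instance/target prefix match, evaluated before the alertname match (used to recognise E2E test alerts); a case-insensitive prefix hit alone makes the rule match
+ # 2026-04-09 Claude Sonnet 4.6: support instance_prefix matching so E2E test alerts do not fall through to generic_fallback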
+ if instance and match.get("instance_prefix"):
+ for prefix in match["instance_prefix"]:
+ if instance.lower().startswith(prefix.lower()):
+ return True
+
# alertname 完全匹配
if alertnames and alertname in alertnames:
return True
@@ -153,11 +160,13 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
rules = _load_rules()
vars = _extract_vars(alert_context)
+ instance = labels.get("instance", alert_context.get("target_resource", ""))
+
matched_rule = None
for rule in rules:
if _is_generic(rule):
continue # 通用兜底最後才用
- if _matches(rule, alertname, alert_type, message):
+ if _matches(rule, alertname, alert_type, message, instance):
matched_rule = rule
break
diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index 87e5c8bd..bdf88ecc 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -196,6 +196,51 @@ async def _push_decision_to_telegram(
)
+async def _push_auto_repair_result(
+ incident: Incident,  # incident whose id and first affected service appear in the message
+ action: str,  # executed command/action text; truncated to 80 characters in the message
+ success: bool,  # selects the success template (True) or the failure template (False)
+ error: str = "",  # failure detail; truncated to 100 characters, only used when success is False
+) -> None:
+ """
+ Send a Telegram notification reporting the outcome of an auto-repair run (success, or failure with the error text).
+ Requirement: the result must map to the originating alert message. NOTE(review): only the incident id links them here; no reply/thread reference is passed to the gateway — confirm.
+ Best-effort: any exception is logged as a warning and not propagated. (Added 2026-04-09, Claude Sonnet 4.6, Asia/Taipei.)
+ """
+ try:
+ from src.services.telegram_gateway import get_telegram_gateway  # function-local import — presumably to avoid a circular import; confirm
+
+ gateway = get_telegram_gateway()
+ target = incident.affected_services[0] if incident.affected_services else "unknown"  # first affected service, or "unknown" when the list is empty
+ inc_id = incident.incident_id
+
+ if success:
+ text = (
+ f"✅ [自動修復完成]\n"
+ f"├ 事件: {inc_id}\n"
+ f"├ 對象: {target[:50]}\n"
+ f"└ 執行: {action[:80] if action else '已執行'}"
+ )
+ else:
+ text = (
+ f"❌ [自動修復失敗]\n"
+ f"├ 事件: {inc_id}\n"
+ f"├ 對象: {target[:50]}\n"
+ f"├ 動作: {action[:80] if action else '未知'}\n"
+ f"└ 錯誤: {error[:100] if error else '未知錯誤'}"
+ )
+
+ await gateway.send_notification(text)
+ logger.info(
+ "auto_repair_result_sent",
+ incident_id=inc_id,
+ success=success,
+ )
+
+ except Exception as e:  # catch-all: a failed notification only produces the warning log below
+ logger.warning("auto_repair_result_push_failed", incident_id=incident.incident_id, error=str(e))
+
+
# =============================================================================
# Decision States
# =============================================================================
@@ -544,7 +589,11 @@ class DecisionManager:
ADR-030 Phase 4: 自動執行已批准的操作
僅當 AutoApprovePolicy 判斷可自動執行時呼叫
+ 執行後發 Telegram 結果通知 (統帥要求: 修復結果對應同一告警)
+ 2026-04-09 Claude Sonnet 4.6 Asia/Taipei
"""
+ action = token.proposal_data.get("kubectl_command", "")
+
try:
# 延遲導入避免循環依賴
from src.models.approval import ApprovalRequest, ApprovalStatus
@@ -553,7 +602,7 @@ class DecisionManager:
# 建立虛擬 ApprovalRequest
approval = ApprovalRequest(
incident_id=incident.incident_id,
- action=token.proposal_data.get("kubectl_command", ""),
+ action=action,
status=ApprovalStatus.APPROVED,
risk_level=token.proposal_data.get("risk_level", "low"),
)
@@ -573,6 +622,11 @@ class DecisionManager:
action=approval.action,
)
+ # 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
+ asyncio.create_task(
+ _push_auto_repair_result(incident, action, success=True)
+ )
+
except Exception as e:
logger.error(
"auto_execute_failed",
@@ -583,7 +637,10 @@ class DecisionManager:
token.error = f"Auto-execute failed: {e}"
await self._save_token(token)
- # 失敗時 fallback 到人工審核
+ # 2026-04-09 Claude Sonnet 4.6: 執行失敗 → 發 Telegram 失敗通知 + fallback 人工
+ asyncio.create_task(
+ _push_auto_repair_result(incident, action, success=False, error=str(e))
+ )
asyncio.create_task(
_push_decision_to_telegram(incident, token.proposal_data)
)