# ── E2E / smoke-test alerts ──────────────────────────────────
# 2026-04-09 Claude Sonnet 4.6: recognise synthetic E2E test alerts; they are
# only recorded, never repaired.
#
# NOTE(review): `test-host`, `smoke test` and `please ignore` are broad
# patterns. Presumably a real alert whose instance or message happens to
# contain one of them would also resolve to NO_ACTION; confirm they cannot
# collide with production host names or alert texts.

- id: e2e_smoke_test
  priority: 120
  description: E2E Smoke Test / 告警鏈路驗證假告警
  match:
    # Alert names that identify a synthetic test alert.
    alertname:
      - E2E_SMOKE_TEST
      - E2E_FINAL_SMOKE_TEST
      - SmokeTest
    # Prefixes of the alert's instance. The rule engine compares them
    # case-insensitively with startswith(), and a single prefix hit is
    # enough for the rule to match.
    instance_prefix:
      - e2e-final-
      - e2e-test-
      - test-host
      - smoke-test-
    # Message fragments; presumably matched as substrings of the alert
    # message. TODO confirm against the rule engine.
    message:
      - e2e smoke test
      - smoke test
      - please ignore
      - e2e test
      - e2e-final
      - e2e-test
      - e2e_smoke
      - alert chain smoke
  response:
    action_title: "告警鏈路驗證成功 (E2E)"
    description: "✅ E2E Smoke Test 告警已收到,告警鏈路正常。此告警僅用於驗證,無需修復動作。"
    # No remediation is proposed: the action is NO_ACTION and the command
    # is intentionally empty.
    suggested_action: NO_ACTION
    kubectl_command: ""
    estimated_downtime: "N/A"
    risk: low
    responsibility: INFRA
    responsibility_reasoning: "E2E smoke test 假告警,告警鏈路驗證用途,系統自動識別跳過修復"
    secondary_teams: []
    optimization: []
    reasoning: "[規則匹配] E2E Smoke Test 假告警,僅確認告警鏈路暢通,無實際服務異常。"
def _format_auto_repair_message(
    inc_id: object,
    target: object,
    action: object,
    success: bool,
    error: object = "",
) -> str:
    """Build the Telegram text that reports an auto-repair outcome.

    Args:
        inc_id: Incident identifier shown in the message.
        target: Affected service shown in the message (first 50 characters).
        action: Executed command (first 80 characters); a placeholder is
            shown when it is empty.
        success: Selects the success or the failure template.
        error: Failure reason (first 100 characters); only used when
            ``success`` is false.

    Returns:
        The multi-line notification text.
    """
    # Coerce before slicing: a non-string value (e.g. a dict service entry or
    # an exception object) would otherwise raise here and the caller would
    # drop the notification.
    target_text = str(target)[:50]

    if success:
        return (
            f"✅ [自動修復完成]\n"
            f"├ 事件: {inc_id}\n"
            f"├ 對象: {target_text}\n"
            f"└ 執行: {str(action)[:80] if action else '已執行'}"
        )

    return (
        f"❌ [自動修復失敗]\n"
        f"├ 事件: {inc_id}\n"
        f"├ 對象: {target_text}\n"
        f"├ 動作: {str(action)[:80] if action else '未知'}\n"
        f"└ 錯誤: {str(error)[:100] if error else '未知錯誤'}"
    )


async def _push_auto_repair_result(
    incident: Incident,
    action: str,
    success: bool,
    error: str = "",
) -> None:
    """Send a Telegram notification with the result of an auto-repair.

    The message carries the incident id and the first affected service so the
    result can be correlated with the alert it belongs to.
    2026-04-09 Claude Sonnet 4.6 Asia/Taipei

    Args:
        incident: Incident the repair was executed for; ``incident_id`` and
            ``affected_services`` are read from it.
        action: Command that was executed (may be empty).
        success: Whether the execution succeeded.
        error: Failure reason; only used when ``success`` is false.

    Returns:
        None. Delivery is best-effort: any failure is logged as a warning
        and never propagated to the caller.
    """
    # Resolved outside the try block so the except handler can log it without
    # touching ``incident`` again; re-reading the attribute there could raise
    # a second time and escape this best-effort coroutine.
    inc_id = getattr(incident, "incident_id", "unknown")

    try:
        # Function-scope import kept from the original block.
        # NOTE(review): presumably avoids a circular import; confirm.
        from src.services.telegram_gateway import get_telegram_gateway

        gateway = get_telegram_gateway()
        services = incident.affected_services
        target = services[0] if services else "unknown"

        text = _format_auto_repair_message(inc_id, target, action, success, error)

        await gateway.send_notification(text)
        logger.info(
            "auto_repair_result_sent",
            incident_id=inc_id,
            success=success,
        )

    except Exception as e:
        # Deliberately broad: a failed notification must not affect the
        # repair flow that scheduled it.
        logger.warning("auto_repair_result_push_failed", incident_id=inc_id, error=str(e))