fix: E2E test 告警識別 + 自動修復結果 Telegram 通知
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
**alert_rule_engine.py**
- _matches() 加入 instance_prefix 匹配(最高優先)
- match_rule() 傳入 instance label 至 _matches
- 用途: e2e-final-* / e2e-test-* instance 可被 YAML 規則識別

**alert_rules.yaml**
- 新增 e2e_smoke_test 規則 (priority=120)
- alertname: E2E_SMOKE_TEST / instance_prefix: e2e-final- / e2e-test- / test-host
- suggested_action: NO_ACTION,顯示「告警鏈路驗證成功」

**decision_manager.py**
- _auto_execute() 成功後發 Telegram 結果通知 ✅
- _auto_execute() 失敗後發 Telegram 失敗通知 ❌
- 新增 _push_auto_repair_result() 函數

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -510,6 +510,44 @@ rules:
|
||||
command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/reset -X POST"
|
||||
reasoning: "[規則匹配] AI Router 已自動降級至備援 Provider,監控熔斷器恢復狀態即可。"
|
||||
|
||||
# ── E2E / Smoke Test 告警 ────────────────────────────────────
|
||||
# 2026-04-09 Claude Sonnet 4.6: E2E test 假告警識別,僅記錄不修復
|
||||
|
||||
- id: e2e_smoke_test
|
||||
priority: 120
|
||||
description: E2E Smoke Test / 告警鏈路驗證假告警
|
||||
match:
|
||||
alertname:
|
||||
- E2E_SMOKE_TEST
|
||||
- E2E_FINAL_SMOKE_TEST
|
||||
- SmokeTest
|
||||
instance_prefix:
|
||||
- e2e-final-
|
||||
- e2e-test-
|
||||
- test-host
|
||||
- smoke-test-
|
||||
message:
|
||||
- e2e smoke test
|
||||
- smoke test
|
||||
- please ignore
|
||||
- e2e test
|
||||
- e2e-final
|
||||
- e2e-test
|
||||
- e2e_smoke
|
||||
- alert chain smoke
|
||||
response:
|
||||
action_title: "告警鏈路驗證成功 (E2E)"
|
||||
description: "✅ E2E Smoke Test 告警已收到,告警鏈路正常。此告警僅用於驗證,無需修復動作。"
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: ""
|
||||
estimated_downtime: "N/A"
|
||||
risk: low
|
||||
responsibility: INFRA
|
||||
responsibility_reasoning: "E2E smoke test 假告警,告警鏈路驗證用途,系統自動識別跳過修復"
|
||||
secondary_teams: []
|
||||
optimization: []
|
||||
reasoning: "[規則匹配] E2E Smoke Test 假告警,僅確認告警鏈路暢通,無實際服務異常。"
|
||||
|
||||
# ── 通用兜底 ────────────────────────────────────────────────
|
||||
|
||||
- id: generic_fallback
|
||||
|
||||
@@ -101,7 +101,7 @@ def _load_rules() -> list[dict]:
|
||||
# ── 匹配邏輯 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
|
||||
def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance: str = "") -> bool:
|
||||
"""判斷規則是否匹配。通用兜底規則(alertname=["*"])永遠回傳 False,由 match_rule 單獨處理。"""
|
||||
match = rule.get("match", {})
|
||||
|
||||
@@ -110,6 +110,13 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str) -> bool:
|
||||
if alertnames == ["*"]:
|
||||
return False
|
||||
|
||||
# instance/target 前綴匹配 (最高優先,用於 E2E test 告警識別)
|
||||
# 2026-04-09 Claude Sonnet 4.6: 支援 instance_prefix 匹配,讓 E2E test 告警不走 generic_fallback
|
||||
if instance and match.get("instance_prefix"):
|
||||
for prefix in match["instance_prefix"]:
|
||||
if instance.lower().startswith(prefix.lower()):
|
||||
return True
|
||||
|
||||
# alertname 完全匹配
|
||||
if alertnames and alertname in alertnames:
|
||||
return True
|
||||
@@ -153,11 +160,13 @@ def match_rule(alert_context: dict) -> dict[str, Any] | None:
|
||||
rules = _load_rules()
|
||||
vars = _extract_vars(alert_context)
|
||||
|
||||
instance = labels.get("instance", alert_context.get("target_resource", ""))
|
||||
|
||||
matched_rule = None
|
||||
for rule in rules:
|
||||
if _is_generic(rule):
|
||||
continue # 通用兜底最後才用
|
||||
if _matches(rule, alertname, alert_type, message):
|
||||
if _matches(rule, alertname, alert_type, message, instance):
|
||||
matched_rule = rule
|
||||
break
|
||||
|
||||
|
||||
@@ -196,6 +196,51 @@ async def _push_decision_to_telegram(
|
||||
)
|
||||
|
||||
|
||||
async def _push_auto_repair_result(
    incident: Incident,
    action: str,
    success: bool,
    error: str = "",
) -> None:
    """Send a Telegram notification reporting the outcome of an auto-repair.

    The message carries the incident id so the result can be tied back to
    the alert it belongs to (stated requirement: a repair result must map
    to the same alert).

    Args:
        incident: Incident the repair was executed for. ``incident_id`` and
            ``affected_services`` are read from it.
        action: Command that was executed; truncated to 80 characters in
            the message. An empty value is replaced by a placeholder.
        success: ``True`` builds the "repair completed" message, ``False``
            builds the "repair failed" message.
        error: Failure detail, truncated to 100 characters. Only used when
            ``success`` is ``False``.

    Returns:
        None. This is best-effort: any exception raised while building or
        sending the message is logged as a warning and not re-raised.

    2026-04-09 Claude Sonnet 4.6 Asia/Taipei
    """
    try:
        # Function-local imports, as in the original block.
        from html import escape

        from src.services.telegram_gateway import get_telegram_gateway

        gateway = get_telegram_gateway()
        target = incident.affected_services[0] if incident.affected_services else "unknown"
        inc_id = incident.incident_id

        # The text below uses <b>/<code> markup, so dynamic values must be
        # HTML-escaped: a raw "<", ">" or "&" (common in exception text such
        # as "<class 'TimeoutError'>") would yield malformed markup and the
        # notification would be dropped.
        # NOTE(review): presumes the gateway sends with HTML parse mode - confirm.
        # Truncate BEFORE escaping so an entity like "&lt;" is never cut in half.
        safe_id = escape(str(inc_id), quote=False)
        safe_target = escape(str(target[:50]), quote=False)

        if success:
            safe_action = escape(action[:80], quote=False) if action else '已執行'
            text = (
                f"✅ <b>[自動修復完成]</b>\n"
                f"├ 事件: <code>{safe_id}</code>\n"
                f"├ 對象: <code>{safe_target}</code>\n"
                f"└ 執行: <code>{safe_action}</code>"
            )
        else:
            safe_action = escape(action[:80], quote=False) if action else '未知'
            safe_error = escape(error[:100], quote=False) if error else '未知錯誤'
            text = (
                f"❌ <b>[自動修復失敗]</b>\n"
                f"├ 事件: <code>{safe_id}</code>\n"
                f"├ 對象: <code>{safe_target}</code>\n"
                f"├ 動作: <code>{safe_action}</code>\n"
                f"└ 錯誤: {safe_error}"
            )

        await gateway.send_notification(text)
        logger.info(
            "auto_repair_result_sent",
            incident_id=inc_id,
            success=success,
        )

    except Exception as e:
        # Notification failure must never break the repair flow; log and move on.
        logger.warning("auto_repair_result_push_failed", incident_id=incident.incident_id, error=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Decision States
|
||||
# =============================================================================
|
||||
@@ -544,7 +589,11 @@ class DecisionManager:
|
||||
ADR-030 Phase 4: 自動執行已批准的操作
|
||||
|
||||
僅當 AutoApprovePolicy 判斷可自動執行時呼叫
|
||||
執行後發 Telegram 結果通知 (統帥要求: 修復結果對應同一告警)
|
||||
2026-04-09 Claude Sonnet 4.6 Asia/Taipei
|
||||
"""
|
||||
action = token.proposal_data.get("kubectl_command", "")
|
||||
|
||||
try:
|
||||
# 延遲導入避免循環依賴
|
||||
from src.models.approval import ApprovalRequest, ApprovalStatus
|
||||
@@ -553,7 +602,7 @@ class DecisionManager:
|
||||
# 建立虛擬 ApprovalRequest
|
||||
approval = ApprovalRequest(
|
||||
incident_id=incident.incident_id,
|
||||
action=token.proposal_data.get("kubectl_command", ""),
|
||||
action=action,
|
||||
status=ApprovalStatus.APPROVED,
|
||||
risk_level=token.proposal_data.get("risk_level", "low"),
|
||||
)
|
||||
@@ -573,6 +622,11 @@ class DecisionManager:
|
||||
action=approval.action,
|
||||
)
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
|
||||
asyncio.create_task(
|
||||
_push_auto_repair_result(incident, action, success=True)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"auto_execute_failed",
|
||||
@@ -583,7 +637,10 @@ class DecisionManager:
|
||||
token.error = f"Auto-execute failed: {e}"
|
||||
await self._save_token(token)
|
||||
|
||||
# 失敗時 fallback 到人工審核
|
||||
# 2026-04-09 Claude Sonnet 4.6: 執行失敗 → 發 Telegram 失敗通知 + fallback 人工
|
||||
asyncio.create_task(
|
||||
_push_auto_repair_result(incident, action, success=False, error=str(e))
|
||||
)
|
||||
asyncio.create_task(
|
||||
_push_decision_to_telegram(incident, token.proposal_data)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user