diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py
index 8e033de6..3885a2a5 100644
--- a/apps/api/src/services/failover_alerter.py
+++ b/apps/api/src/services/failover_alerter.py
@@ -134,46 +134,7 @@ class FailoverAlerter:
             logger.debug("governance_alert_dedup_skipped", event_type=event_type)
             return
 
-        status = _escape_md(str(payload.get("status", "warning")))
-        impact = _as_dict(payload.get("impact"))
-        remediation = _as_dict(payload.get("remediation"))
-        actionable = _as_dict(payload.get("actionable"))
-
-        impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
-        remediation_lines = _lines_from_list(remediation.get("items"))
-        remediation_next_action = remediation.get("next_action")
-        remediation_hint = remediation.get("hint")
-        actionable_lines = _lines_from_list(actionable.get("items"))
-
-        next_action_line = ""
-        if remediation_next_action:
-            next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
-        if remediation_hint:
-            next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
-
-        sections: list[str] = [
-            "⚠️ *AI 治理警報*",
-            f"\n類型:{_escape_md(event_type)}",
-            f"狀態:{status}",
-        ]
-        if impact_lines:
-            sections.append(f"\n*影響*\n{impact_lines}")
-        if remediation_lines or next_action_line:
-            sections.append("\n*修復方向*")
-            if remediation_lines:
-                sections.append(remediation_lines)
-            if next_action_line:
-                sections.append(next_action_line)
-        if actionable_lines:
-            sections.append(f"\n*可直接自動化*\n{actionable_lines}")
-
-        fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
-        if fallback_items:
-            sections.append(
-                "\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
-            )
-
-        msg = "\n".join(sections)
+        msg = format_governance_alert_card(event_type, payload)
         await self._send(msg)
         logger.info("governance_alert_sent", event_type=event_type)
 
@@ -336,6 +297,204 @@ def _as_dict(value: Any) -> dict[str, Any]:
     return value if isinstance(value, dict) else {}
 
 
+# Display names shown in the card title for known governance event types.
+_EVENT_DISPLAY_NAMES = {
+    "trust_drift": "信任漂移",
+    "knowledge_degradation": "知識庫劣化",
+    "governance_slo_data_gap": "SLO 資料缺口",
+    "governance_self_failure": "治理自檢失敗",
+    "llm_hallucination": "LLM 驗證失敗",
+    "execution_blast_radius": "執行風險擴大",
+}
+
+# Lower-cased status text -> badge shown on the status line.
+_STATUS_BADGES = {
+    "critical": "🔴 critical",
+    "error": "🔴 error",
+    "violation": "🔴 violation",
+    "warning": "🟡 warning",
+    "degraded": "🟠 degraded",
+    "ok": "🟢 ok",
+}
+
+# Per event type: ordered (impact key, display label) pairs rendered first.
+_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {
+    "trust_drift": [
+        ("drifted_count", "漂移 Playbook"),
+        ("total_playbooks", "總 Playbook"),
+        ("drift_ratio", "漂移比例"),
+        ("threshold", "警戒門檻"),
+        ("auto_deprecated_count", "自動停用"),
+    ],
+    "knowledge_degradation": [
+        ("stale_count", "陳舊 KM"),
+        ("total_count", "總 KM"),
+        ("stale_ratio", "陳舊比例"),
+        ("threshold", "警戒門檻"),
+        ("stale_days", "陳舊天數"),
+    ],
+    "governance_slo_data_gap": [
+        ("reason", "缺口原因"),
+        ("skipped_count", "略過指標"),
+        ("all_slo_metrics_not_emitted", "SLO 指標缺失"),
+    ],
+    "governance_self_failure": [
+        ("failed_checks", "失敗檢查"),
+        ("total_checks", "總檢查"),
+        ("failure_rate", "失敗比例"),
+    ],
+    "execution_blast_radius": [
+        ("affected_services", "受影響服務"),
+        ("blast_radius", "爆炸半徑"),
+        ("threshold", "警戒門檻"),
+    ],
+    "llm_hallucination": [
+        ("failed", "驗證失敗"),
+        ("rate", "失敗比例"),
+        ("threshold", "警戒門檻"),
+    ],
+}
+
+
+def _event_display_name(event_type: str) -> str:
+    """Return the card title for *event_type*, falling back to a title-cased name."""
+    if event_type in _EVENT_DISPLAY_NAMES:
+        return _EVENT_DISPLAY_NAMES[event_type]
+    if event_type.startswith("slo_"):
+        return "SLO 違反"
+    return event_type.replace("_", " ").strip().title()
+
+
+def _status_badge(status: Any) -> str:
+    """Map a status to its badge; falsy means "warning", unknown text passes through."""
+    status_text = str(status or "warning")
+    return _STATUS_BADGES.get(status_text.lower(), status_text)
+
+
+def _format_metric_value(key: str, value: Any) -> str:
+    """Render one impact value for display.
+
+    Booleans become 是/否, ratio-like numeric keys become percentages, and lists
+    show their first three items followed by the total count when longer.
+    """
+    # bool is a subclass of int, so it must be handled before the numeric branch.
+    if isinstance(value, bool):
+        return "是" if value else "否"
+    if isinstance(value, (float, int)) and (
+        key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}
+    ):
+        return f"{float(value) * 100:.1f}%"
+    if isinstance(value, list):
+        if not value:
+            return "0"
+        shown = ", ".join(str(item) for item in value[:3])
+        if len(value) > 3:
+            shown += f"…(共 {len(value)})"
+        return shown
+    return str(value)
+
+
+def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
+    """Render impact fields as ``label:value`` rows, profiled keys first.
+
+    Keys listed in ``_IMPACT_PROFILES`` for *event_type* come first under their
+    display labels; the remaining keys follow in sorted order under their raw
+    names. At most *max_rows* rows are returned; when some fields do not fit,
+    the last row is an overflow notice instead of a data row.
+    """
+    if not data or max_rows <= 0:
+        return []
+
+    used: set[str] = set()
+    rows: list[str] = []
+    for key, label in _IMPACT_PROFILES.get(event_type, []):
+        if key in data:
+            rows.append(f"{label}:{_format_metric_value(key, data[key])}")
+            used.add(key)
+
+    rows.extend(
+        f"{key}:{_format_metric_value(key, data[key])}"
+        for key in sorted(data)
+        if key not in used
+    )
+
+    if len(rows) > max_rows:
+        # Each row stands for exactly one field, so a surplus means hidden fields:
+        # keep the cap and spend the last slot on the overflow notice.
+        rows[max_rows - 1 :] = ["更多欄位已收斂至 AwoooP 稽核資料"]
+    return rows
+
+
+def _tree_lines(rows: list[str]) -> str:
+    """Join *rows* as an escaped tree: ├ before every row except the last, which gets └."""
+    if not rows:
+        return ""
+    rendered: list[str] = []
+    for idx, row in enumerate(rows):
+        branch = "└" if idx == len(rows) - 1 else "├"
+        rendered.append(f"{branch} {_escape_md(str(row))}")
+    return "\n".join(rendered)
+
+
+def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
+    """Return the impact section body: profiled rows rendered as a tree."""
+    rows = _profiled_rows(event_type, impact)
+    return _tree_lines(rows)
+
+
+def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
+    """Format the AI-governance Telegram card.
+
+    2026-05-07 Codex — the governance payload itself is kept intact; only at the
+    Telegram boundary are raw key/value pairs turned into a scannable card, so
+    that large plain-text field dumps do not flood the chat.
+
+    A non-dict *payload* is treated as an empty payload.
+    """
+    payload = payload if isinstance(payload, dict) else {}
+    impact = _as_dict(payload.get("impact"))
+    remediation = _as_dict(payload.get("remediation"))
+    actionable = _as_dict(payload.get("actionable"))
+    status = payload.get("status", "warning")
+
+    sections: list[str] = [
+        f"⚠️ *AI 治理警報|{_escape_md(_event_display_name(event_type))}*",
+        "──────────────────────",
+        f"類型:{_escape_md(event_type)}",
+        f"狀態:{_escape_md(_status_badge(status))}",
+    ]
+
+    impact_lines = _governance_summary_lines(event_type, impact)
+    if impact_lines:
+        sections.extend(["", "🧭 *影響摘要*", impact_lines])
+
+    remediation_lines = _lines_from_list(remediation.get("items"))
+    remediation_next_action = remediation.get("next_action")
+    remediation_hint = remediation.get("hint")
+    if remediation_lines or remediation_next_action or remediation_hint:
+        sections.extend(["", "🛠️ *修復方向*"])
+        if remediation_lines:
+            sections.append(remediation_lines)
+        if remediation_next_action:
+            sections.append(f"▶️ 下一步:{_escape_md(str(remediation_next_action))}")
+        if remediation_hint:
+            sections.append(f"💡 提示:{_escape_md(str(remediation_hint))}")
+
+    actionable_lines = _lines_from_list(actionable.get("items"))
+    if actionable_lines:
+        sections.extend(["", "🤖 *可自動化工作*", actionable_lines])
+
+    fallback_items = _fallback_pairs(
+        payload,
+        keep={"status", "impact", "remediation", "actionable"},
+        max_items=4,
+    )
+    if fallback_items:
+        sections.extend(["", "📎 *補充欄位*", "\n".join(fallback_items)])
+
+    return "\n".join(sections)
+
+
 def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
     if not data:
         return ""
@@ -360,7 +519,12 @@ def _lines_from_list(value: Any) -> str:
     )
 
 
-def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
+def _fallback_pairs(
+    payload: dict[str, Any],
+    keep: set[str] | None = None,
+    *,
+    max_items: int | None = None,
+) -> list[str]:
     if not isinstance(payload, dict):
         return []
     keep = set(keep or set())
@@ -368,6 +532,9 @@ def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> li
     for key in sorted(payload.keys()):
         if key in keep:
             continue
+        if max_items is not None and len(rows) >= max_items:
+            rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
+            break
         rows.append(f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}")
     return rows
 
diff --git a/apps/api/src/services/runbook_generator.py b/apps/api/src/services/runbook_generator.py
index 6b06ecd7..5bca4f0e 100644
--- a/apps/api/src/services/runbook_generator.py
+++
b/apps/api/src/services/runbook_generator.py @@ -24,12 +24,19 @@ Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting from __future__ import annotations import asyncio +import html +import re import time from typing import TYPE_CHECKING import structlog -from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate +from src.models.knowledge import ( + EntrySource, + EntryStatus, + EntryType, + KnowledgeEntryCreate, +) if TYPE_CHECKING: from src.models.incident import Incident @@ -38,6 +45,88 @@ if TYPE_CHECKING: logger = structlog.get_logger(__name__) +_CARD_MAX_LEN = 3600 +_SECTION_RE = re.compile(r"^#{1,6}\s+(?P
{_html(incident_id)}\n"
+ f"🧩 受影響服務:{_html(services)}\n"
+ "🧠 知識狀態:DRAFT|需人工審核\n"
+ f"🗂️ Entry ID:{_html(entry_id)}\n\n"
+ "🧾 內容摘要\n"
+ f"├ 症狀:{_html(symptom)}\n"
+ f"└ 執行:{_html(step)}\n\n"
+ "✅ 審核重點\n"
+ "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme\n"
+ "2. 補齊適用條件、rollback 與驗證方式\n\n"
+ "🔎 AwoooP:知識庫 / Runbook Review"
+ )
+ return message[:_CARD_MAX_LEN]
+
class NemotronRunbookGenerator:
"""
@@ -109,7 +198,7 @@ class NemotronRunbookGenerator:
playbook_id=playbook.playbook_id,
)
- await self._push_runbook_review_card(incident, entry.id, content[:200])
+ await self._push_runbook_review_card(incident, entry.id, content)
except Exception as e:
logger.error(
@@ -300,13 +389,7 @@ class NemotronRunbookGenerator:
try:
from src.services.telegram_gateway import get_telegram_gateway
tg = get_telegram_gateway()
- await tg.send_text(
- f"📄 Auto Runbook 待審核\n"
- f"Incident: {incident.incident_id}\n"
- f"Entry ID: {entry_id}\n\n"
- f"{content_preview}...\n\n"
- f"請至知識庫審核並發布。"
- )
+ await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview))
except Exception as e:
logger.warning("runbook_review_card_failed", error=str(e))
diff --git a/apps/api/tests/test_failover_alerter.py b/apps/api/tests/test_failover_alerter.py
index 51bbe407..bcfed64e 100644
--- a/apps/api/tests/test_failover_alerter.py
+++ b/apps/api/tests/test_failover_alerter.py
@@ -22,6 +22,7 @@ from src.services.failover_alerter import (
_lines_from_list,
_sanitize_telegram_error,
configure_alerter,
+ format_governance_alert_card,
get_failover_alerter,
reset_failover_alerter,
)
@@ -249,3 +250,57 @@ def test_sanitize_telegram_error_redacts_bot_token_url() -> None:
assert "SECRET" not in sanitized
assert "botINC-20260506-E54736" in card
+ assert "🧾 內容摘要" in card
+ assert "placeholder 或不支援的執行步驟" in card
+ assert "## 症狀描述" not in card
+ assert "ssh{host}" not in card
+
# =============================================================================
# TestAutoRepairService — fire-and-forget 與 GC 防洩漏