feat(telegram): SRE 戰情室群組三頭政治 Triumvirate (ADR-053)
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 7m6s

- config.py: 新增 OPENCLAW_BOT_TOKEN / NEMOTRON_BOT_TOKEN / SRE_GROUP_CHAT_ID
- telegram_gateway.py: send_to_group / send_as_openclaw / send_as_nemotron / trigger_group_ai_discussion / _send_approval_card_to_group
- send_approval_card 告警發送後非同步觸發群組 AI 雙向討論
- configmap: SRE_GROUP_CHAT_ID=-1003711974679
- secrets: OPENCLAW_BOT_TOKEN / NEMOTRON_BOT_TOKEN CHANGE_ME 佔位

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-03 17:16:05 +08:00
parent ff5a77f7a9
commit c65ed5b1c9
4 changed files with 343 additions and 0 deletions

View File

@@ -345,6 +345,19 @@ class Settings(BaseSettings):
default=False,
description="Telegram Polling (False: OpenClaw handles it; True: only if OpenClaw unavailable)",
)
# 2026-04-03 ogt: SRE war-room group "Triumvirate" settings — ADR-053
# Token of the bot that speaks as OpenClaw AI inside the group.
# Defaults to the empty string; send_as_openclaw skips sending when it is empty.
OPENCLAW_BOT_TOKEN: str = Field(
default="",
description="@OpenClawAwoooI_Bot Token — 群組內代表 OpenClaw AI 發言",
)
# Token of the bot that speaks as NemoClaw AI inside the group.
# Defaults to the empty string; send_as_nemotron skips sending when it is empty.
NEMOTRON_BOT_TOKEN: str = Field(
default="",
description="@NemoTronAwoooI_Bot Token — 群組內代表 NemoClaw AI 發言",
)
# Chat ID of the SRE war-room group, kept as a string.
# Empty by default, which disables every group-related send path.
SRE_GROUP_CHAT_ID: str = Field(
default="",
description="AwoooI SRE 戰情室群組 Chat ID",
)
def get_tg_user_whitelist(self) -> list[int]:
"""Parse comma-separated or JSON array user IDs to list[int]"""

View File

@@ -1375,8 +1375,58 @@ class TelegramGateway:
message_id=result.get("result", {}).get("message_id"),
)
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
# 非同步執行,失敗不影響告警主流程
if settings.SRE_GROUP_CHAT_ID:
asyncio.create_task(
self._send_approval_card_to_group(
approval_id=approval_id,
risk_level=risk_level,
resource_name=resource_name,
root_cause=root_cause,
suggested_action=suggested_action,
)
)
return result
async def _send_approval_card_to_group(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
) -> None:
    """
    Post the alert card to the SRE group, then start the AI discussion.

    Intended to be scheduled with asyncio.create_task; every failure is
    logged and swallowed so the primary alert flow is never affected.

    Args:
        approval_id: Identifier of the approval request, shown on the card.
        risk_level: Risk level; "critical" / "medium" / "low" get an emoji,
            any other value gets none.
        resource_name: Name of the affected resource.
        root_cause: Root-cause text shown on the card and passed to the AI.
        suggested_action: Suggested action shown on the card and passed to the AI.
    """
    # Function-scope import keeps this block self-contained.
    import html

    def _esc(value: str) -> str:
        # The card is sent with parse_mode="HTML" (send_to_group default), so
        # any literal <, > or & in alert data must be entity-encoded or the
        # Telegram API rejects the whole message.
        return html.escape(str(value), quote=False)

    try:
        risk_emoji = {"critical": "🔴", "medium": "🟡", "low": "🟢"}.get(risk_level, "")
        level_tag = risk_level.upper()
        summary = (
            f"{risk_emoji} <b>[{_esc(level_tag)}] SRE 告警</b>\n\n"
            f"📦 資源: <code>{_esc(resource_name)}</code>\n"
            f"🔍 根因: {_esc(root_cause)}\n"
            f"💡 建議: {_esc(suggested_action)}\n"
            f"🆔 <code>{_esc(approval_id)}</code>"
        )
        group_result = await self.send_to_group(text=summary)
        group_msg_id = (
            group_result.get("result", {}).get("message_id")
            if group_result.get("ok")
            else None
        )
        if group_msg_id:
            # The AI prompt is plain text, not HTML, so it gets the raw values.
            await self.trigger_group_ai_discussion(
                alert_message_id=group_msg_id,
                alert_summary=f"[{level_tag}] 資源: {resource_name}\n根因: {root_cause}\n建議: {suggested_action}",
            )
    except Exception as e:
        # Best-effort side channel: never propagate into the main alert flow.
        logger.error("send_approval_card_to_group_failed", error=str(e))
# =========================================================================
# 新訊息發送方法 (2026-03-29 ogt: ADR-038)
# =========================================================================
@@ -2470,6 +2520,220 @@ class TelegramGateway:
return await self._send_request("sendMessage", payload)
# =========================================================================
# 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate) — ADR-053
# @tsenyangbot 發告警卡片到群組,OpenClaw / NemoClaw Bot 各自回覆分析
# =========================================================================
async def send_to_group(
    self,
    text: str,
    parse_mode: str = "HTML",
    reply_markup: dict | None = None,
) -> dict:
    """
    Send a message to the SRE group (SRE_GROUP_CHAT_ID) via the gateway's own bot.

    Args:
        text: Message body; only the first 4096 characters are sent.
        parse_mode: Telegram parse mode for the message.
        reply_markup: Optional reply markup (buttons) attached to the message.

    Returns:
        dict: The Telegram API response, or an empty dict when no group
        chat ID is configured.
    """
    group_chat_id = settings.SRE_GROUP_CHAT_ID
    if group_chat_id:
        message: dict = dict(
            chat_id=group_chat_id,
            text=text[:4096],
            parse_mode=parse_mode,
        )
        if reply_markup:
            message["reply_markup"] = reply_markup
        return await self._send_request("sendMessage", message)

    # No group configured: log and report "nothing sent".
    logger.warning("send_to_group_skipped", reason="SRE_GROUP_CHAT_ID not configured")
    return {}
async def _send_as_bot(
    self,
    token: str,
    chat_id: str,
    text: str,
    reply_to_message_id: int | None = None,
    parse_mode: str = "HTML",
) -> dict:
    """
    Send a message using an explicitly supplied bot token.

    The request goes through the gateway's shared self._http_client, but the
    URL is built from the given token instead of the gateway's own bot.

    Args:
        token: Bot token to send with.
        chat_id: Target group chat ID.
        text: Message body; only the first 4096 characters are sent.
        reply_to_message_id: message_id to reply to, if any.
        parse_mode: Telegram parse mode for the message.

    Returns:
        dict: The decoded Telegram API response.

    Raises:
        TelegramGatewayError: If the HTTP client is not initialized, or the
            request fails. The error message has the bot token redacted.
    """
    if not self._http_client:
        raise TelegramGatewayError("HTTP client not initialized")

    url = f"{self.TELEGRAM_API_BASE}/bot{token}/sendMessage"
    payload: dict = {
        "chat_id": chat_id,
        "text": text[:4096],
        "parse_mode": parse_mode,
    }
    if reply_to_message_id:
        payload["reply_to_message_id"] = reply_to_message_id

    try:
        response = await self._http_client.post(url, json=payload)
        response.raise_for_status()
        return response.json()
    except Exception as exc:
        # The token is part of the URL, and HTTP client errors commonly embed
        # the request URL in their message. Callers log str(e), so redact the
        # token here and drop the original exception from the chain.
        detail = str(exc)
        if token:
            detail = detail.replace(token, "<redacted>")
        raise TelegramGatewayError(
            f"sendMessage failed ({type(exc).__name__}): {detail}"
        ) from None
async def send_as_openclaw(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """
    Speak in the SRE group as the OpenClaw bot.

    Args:
        text: Message body.
        reply_to_message_id: message_id to reply to, if any.

    Returns:
        dict: The Telegram API response, or an empty dict when the OpenClaw
        bot token or the group chat ID is not configured.
    """
    is_configured = bool(settings.OPENCLAW_BOT_TOKEN and settings.SRE_GROUP_CHAT_ID)
    if is_configured:
        return await self._send_as_bot(
            token=settings.OPENCLAW_BOT_TOKEN,
            chat_id=settings.SRE_GROUP_CHAT_ID,
            text=text,
            reply_to_message_id=reply_to_message_id,
        )

    # Missing token or chat ID: skip quietly instead of failing.
    logger.warning("send_as_openclaw_skipped", reason="OPENCLAW_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
    return {}
async def send_as_nemotron(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """
    Speak in the SRE group as the NemoTron bot.

    Args:
        text: Message body.
        reply_to_message_id: message_id to reply to, if any.

    Returns:
        dict: The Telegram API response, or an empty dict when the NemoTron
        bot token or the group chat ID is not configured.
    """
    is_configured = bool(settings.NEMOTRON_BOT_TOKEN and settings.SRE_GROUP_CHAT_ID)
    if is_configured:
        return await self._send_as_bot(
            token=settings.NEMOTRON_BOT_TOKEN,
            chat_id=settings.SRE_GROUP_CHAT_ID,
            text=text,
            reply_to_message_id=reply_to_message_id,
        )

    # Missing token or chat ID: skip quietly instead of failing.
    logger.warning("send_as_nemotron_skipped", reason="NEMOTRON_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
    return {}
async def trigger_group_ai_discussion(
    self,
    alert_message_id: int,
    alert_summary: str,
) -> None:
    """
    Run the two-step AI discussion in the SRE group (Triumvirate core flow).

    Flow:
        1. The OpenClaw bot replies to the alert message with an RCA analysis.
        2. The NemoTron bot replies to the OpenClaw message with a follow-up
           (falls back to replying to the alert if that message id is unknown).
        3. Stop after one round, so the bots never loop on each other.

    Intended to be scheduled with asyncio.create_task; every failure is
    logged and swallowed so the main flow is not affected.

    Args:
        alert_message_id: message_id of the alert in the group; the reply
            chain starts from it.
        alert_summary: Plain-text alert summary handed to the AI models.
    """
    # Function-scope import keeps this block self-contained.
    import html

    # The project is importable under two package roots; try both.
    try:
        from apps.api.src.services.chat_manager import ChatManager  # noqa: PLC0415
    except ImportError:
        try:
            from src.services.chat_manager import ChatManager  # noqa: PLC0415
        except ImportError:
            logger.error("trigger_group_ai_discussion_failed", reason="Cannot import ChatManager")
            return

    try:
        chat_mgr = ChatManager()

        # Step 1: OpenClaw analyses the alert.
        openclaw_prompt = (
            "你是 OpenClawAWOOOI SRE 戰情室的首席 AI 分析師。\n"
            "以下是一則基礎設施告警,請進行 RCA 根因分析並給出 3 點建議行動:\n\n"
            f"{alert_summary}"
        )
        openclaw_analysis = await chat_mgr._call_openclaw(
            system_prompt="你是 OpenClawAWOOOI SRE 戰情室首席 AI精通 K8s、Prometheus、告警分析。",
            user_message=openclaw_prompt,
        )
        if not openclaw_analysis:
            logger.warning("trigger_group_ai_discussion_openclaw_empty")
            return

        # Messages are sent with parse_mode="HTML". Model output is free text
        # that may contain <, > or &, so escape it before adding our markup;
        # otherwise the Telegram API rejects the message.
        openclaw_text = (
            f"🔍 <b>OpenClaw 分析</b>\n\n{html.escape(str(openclaw_analysis), quote=False)}"
        )
        openclaw_result = await self.send_as_openclaw(
            text=openclaw_text,
            reply_to_message_id=alert_message_id,
        )
        openclaw_msg_id = (
            openclaw_result.get("result", {}).get("message_id")
            if openclaw_result.get("ok")
            else None
        )
        logger.info("group_ai_discussion_openclaw_sent", message_id=openclaw_msg_id)

        # Step 2: NemoClaw adds its view. The prompt is plain text, so it
        # receives the raw (unescaped) OpenClaw analysis.
        nemo_prompt = (
            "你是 NemoClawAWOOOI SRE 戰情室的 NemoClaw AI。\n"
            "OpenClaw 剛剛對以下告警做了分析:\n\n"
            f"【原始告警】\n{alert_summary}\n\n"
            f"【OpenClaw 分析】\n{openclaw_analysis}\n\n"
            "請從不同角度補充你的觀點,並指出任何可能被忽略的風險點。"
        )
        nemo_analysis = await chat_mgr._call_nemotron(
            system_prompt="你是 NemoClawAWOOOI SRE 戰情室 AI擅長補充分析與風險評估。",
            user_message=nemo_prompt,
        )
        if not nemo_analysis:
            logger.warning("trigger_group_ai_discussion_nemo_empty")
            return

        nemo_text = (
            f"🤖 <b>NemoClaw 補充</b>\n\n{html.escape(str(nemo_analysis), quote=False)}"
        )
        await self.send_as_nemotron(
            text=nemo_text,
            # Thread under OpenClaw's reply when known, else under the alert.
            reply_to_message_id=openclaw_msg_id or alert_message_id,
        )
        logger.info("group_ai_discussion_completed", alert_message_id=alert_message_id)
    except Exception as e:
        # A failed group discussion must not affect the main flow.
        logger.error("trigger_group_ai_discussion_failed", error=str(e))
async def close(self) -> None:
"""關閉 Gateway"""
# 停止 Long Polling 與 Leader 相關 Tasks

View File

@@ -0,0 +1,60 @@
# AWOOOI 正式環境 Secrets 模板
# 負責人: CIO / CISO
# 版本: v1.0
# 日期: 2026-03-20
#
# ⚠️ 注意: 此檔案為模板,實際值由 CI/CD 或手動注入
# 實際 Secret 值不應提交到 Git
apiVersion: v1
kind: Secret
metadata:
name: awoooi-secrets
namespace: awoooi-prod
type: Opaque
stringData:
# 資料庫連線 (實際值請替換)
# 重要: 必須使用 +asyncpg 驅動 (2026-03-28 K-HA 遷移確認)
DATABASE_URL: "postgresql+asyncpg://awoooi:CHANGE_ME@192.168.0.188:5432/awoooi_prod"
# Redis 連線
REDIS_URL: "redis://192.168.0.188:6380/10"
# JWT 認證
JWT_SECRET: "CHANGE_ME_TO_RANDOM_STRING"
JWT_ALGORITHM: "HS256"
# AI 服務 (雲端備援) - ADR-006 v1.3 + ADR-036
GEMINI_API_KEY: "CHANGE_ME"
CLAUDE_API_KEY: "CHANGE_ME"
# 2026-03-29 ogt: ADR-036 Nemotron Tool Calling (83% 精準度)
NVIDIA_API_KEY: "CHANGE_ME"
# 通知服務
SMTP_HOST: "smtp.example.com"
SMTP_USER: "CHANGE_ME"
SMTP_PASSWORD: "CHANGE_ME"
# Phase 5.5: Telegram Gateway (OpenClaw)
OPENCLAW_TG_BOT_TOKEN: "CHANGE_ME"
OPENCLAW_TG_CHAT_ID: "CHANGE_ME"
OPENCLAW_TG_USER_WHITELIST: "CHANGE_ME"
# 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate ADR-053)
# 實際值由 CD 注入 (kubectl patch secret),此處為佔位
OPENCLAW_BOT_TOKEN: "CHANGE_ME"
NEMOTRON_BOT_TOKEN: "CHANGE_ME"
# Webhook 安全 (CISO 要求)
WEBHOOK_HMAC_SECRET: "CHANGE_ME_TO_RANDOM_64_CHARS"
# ============================================================================
# Phase 10: Sentry Self-Hosted (192.168.0.110:9000)
# 2026-03-27: 首席架構師審查 - 補齊遺漏配置
# DSN 格式: http://{public_key}@{host}:{port}/{project_id}
# ============================================================================
SENTRY_DSN: "CHANGE_ME"
# 2026-03-29 ogt: ADR-037 - Comment 回寫需要 Auth Token
# 取得方式: Sentry UI → Settings → Auth Tokens → Create New Token
# 權限: event:admin, project:read, project:write
SENTRY_AUTH_TOKEN: "CHANGE_ME"

View File

@@ -71,6 +71,12 @@ data:
# Phase 22.6: 統帥需要直接在同一 Bot 與 OpenClaw/NemoClaw 雙 AI 對話
TELEGRAM_ENABLE_POLLING: "true"
# ============================================================================
# 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate ADR-053)
# OPENCLAW_BOT_TOKEN / NEMOTRON_BOT_TOKEN 在 Secrets 中配置
# ============================================================================
SRE_GROUP_CHAT_ID: "-1003711974679"
# 快取 TTL (秒)
CACHE_TTL_DASHBOARD: "300"
CACHE_TTL_HOST_STATUS: "30"