async def _send_approval_card_to_group(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
) -> None:
    """
    Post the alert summary to the SRE group and start the AI discussion.

    Scheduled with asyncio.create_task by the approval-card sender, so every
    failure is logged here and never propagates to the main alert flow.

    Args:
        approval_id: Approval identifier shown at the bottom of the card.
        risk_level: Risk level; "critical" / "medium" / "low" get a coloured
            emoji, anything else gets the neutral one.
        resource_name: Name of the affected resource.
        root_cause: Root-cause text of the alert.
        suggested_action: Suggested remediation text.
    """
    import html  # noqa: PLC0415

    def _esc(value: object) -> str:
        # send_to_group() defaults to parse_mode="HTML"; Telegram requires
        # <, > and & outside of tags to be sent as entities, otherwise the
        # message is rejected instead of delivered.
        return html.escape(str(value), quote=False)

    try:
        risk_emoji = {"critical": "🔴", "medium": "🟡", "low": "🟢"}.get(risk_level, "⚪")
        summary = (
            f"{risk_emoji} [{_esc(risk_level.upper())}] SRE 告警\n\n"
            f"📦 資源: {_esc(resource_name)}\n"
            f"🔍 根因: {_esc(root_cause)}\n"
            f"💡 建議: {_esc(suggested_action)}\n"
            f"🆔 {_esc(approval_id)}"
        )
        group_result = await self.send_to_group(text=summary)
        group_msg_id = (
            group_result.get("result", {}).get("message_id")
            if group_result.get("ok")
            else None
        )

        if group_msg_id:
            # The summary below is prompt material for the AI backends, not
            # Telegram markup, so it is passed through unescaped.
            await self.trigger_group_ai_discussion(
                alert_message_id=group_msg_id,
                alert_summary=f"[{risk_level.upper()}] 資源: {resource_name}\n根因: {root_cause}\n建議: {suggested_action}",
            )
    except Exception as e:
        # Best-effort side channel: log and swallow.
        logger.error("send_approval_card_to_group_failed", error=str(e))


# =========================================================================
# SRE war-room group "Triumvirate" (ADR-053)
# The main bot posts the alert card to the group; the OpenClaw and NemoClaw
# bots each reply with their own analysis.
# =========================================================================

async def send_to_group(
    self,
    text: str,
    parse_mode: str = "HTML",
    reply_markup: dict | None = None,
) -> dict:
    """
    Send a message to the SRE group (settings.SRE_GROUP_CHAT_ID) as the main bot.

    Args:
        text: Message text; truncated to 4096 characters before sending.
        parse_mode: Telegram parse mode. With the default "HTML" the caller
            is responsible for escaping dynamic content.
        reply_markup: Optional reply markup (buttons).

    Returns:
        dict: Whatever self._send_request returns for "sendMessage", or an
        empty dict when SRE_GROUP_CHAT_ID is not configured.
    """
    if not settings.SRE_GROUP_CHAT_ID:
        logger.warning("send_to_group_skipped", reason="SRE_GROUP_CHAT_ID not configured")
        return {}

    payload: dict = {
        "chat_id": settings.SRE_GROUP_CHAT_ID,
        "text": text[:4096],
        "parse_mode": parse_mode,
    }
    if reply_markup:
        payload["reply_markup"] = reply_markup

    return await self._send_request("sendMessage", payload)


async def _send_as_bot(
    self,
    token: str,
    chat_id: str,
    text: str,
    reply_to_message_id: int | None = None,
    parse_mode: str = "HTML",
) -> dict:
    """
    Send a message with an explicit bot token.

    The request goes through the shared self._http_client, but the URL is
    built from `token` rather than from the gateway's own bot.

    Args:
        token: Bot token to send with.
        chat_id: Target chat ID.
        text: Message text; truncated to 4096 characters before sending.
        reply_to_message_id: message_id to reply to, if any.
        parse_mode: Telegram parse mode.

    Returns:
        dict: Decoded JSON body of the Telegram API response.

    Raises:
        TelegramGatewayError: If the HTTP client is not initialized, or the
            request fails / returns an error status. The message never
            contains the bot token.
    """
    if not self._http_client:
        raise TelegramGatewayError("HTTP client not initialized")

    url = f"{self.TELEGRAM_API_BASE}/bot{token}/sendMessage"
    payload: dict = {
        "chat_id": chat_id,
        "text": text[:4096],
        "parse_mode": parse_mode,
    }
    if reply_to_message_id:
        payload["reply_to_message_id"] = reply_to_message_id

    try:
        response = await self._http_client.post(url, json=payload)
        response.raise_for_status()
    except Exception as exc:
        # The URL embeds the bot token, and HTTP client errors may quote the
        # request URL in their message (httpx status errors do). Callers log
        # str(error), so redact the token and drop the original exception
        # from the displayed chain before it can reach the logs.
        detail = str(exc)
        if token:
            detail = detail.replace(token, "***")
        raise TelegramGatewayError(
            f"sendMessage with secondary bot failed ({type(exc).__name__}): {detail}"
        ) from None
    return response.json()


async def send_as_openclaw(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """
    Post to the SRE group using settings.OPENCLAW_BOT_TOKEN.

    Args:
        text: Message text.
        reply_to_message_id: message_id to reply to, if any.

    Returns:
        dict: Telegram API response, or an empty dict when the token or the
        group chat ID is not configured.
    """
    if not settings.OPENCLAW_BOT_TOKEN or not settings.SRE_GROUP_CHAT_ID:
        logger.warning("send_as_openclaw_skipped", reason="OPENCLAW_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
        return {}

    return await self._send_as_bot(
        token=settings.OPENCLAW_BOT_TOKEN,
        chat_id=settings.SRE_GROUP_CHAT_ID,
        text=text,
        reply_to_message_id=reply_to_message_id,
    )


async def send_as_nemotron(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """
    Post to the SRE group using settings.NEMOTRON_BOT_TOKEN.

    Args:
        text: Message text.
        reply_to_message_id: message_id to reply to, if any.

    Returns:
        dict: Telegram API response, or an empty dict when the token or the
        group chat ID is not configured.
    """
    if not settings.NEMOTRON_BOT_TOKEN or not settings.SRE_GROUP_CHAT_ID:
        logger.warning("send_as_nemotron_skipped", reason="NEMOTRON_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
        return {}

    return await self._send_as_bot(
        token=settings.NEMOTRON_BOT_TOKEN,
        chat_id=settings.SRE_GROUP_CHAT_ID,
        text=text,
        reply_to_message_id=reply_to_message_id,
    )


async def trigger_group_ai_discussion(
    self,
    alert_message_id: int,
    alert_summary: str,
) -> None:
    """
    Run the two-step AI discussion under an alert posted in the SRE group.

    Flow:
        1. The OpenClaw bot replies to the alert message with its analysis.
        2. The NemoClaw bot replies to the OpenClaw message (or to the alert
           itself when that message_id is unavailable) with a second opinion.
        3. Stop. There is deliberately no further round, to avoid a loop.

    Awaited by _send_approval_card_to_group, which itself runs as a
    background task; all failures are logged and swallowed here.

    Args:
        alert_message_id: message_id of the alert in the group; the reply
            chain starts from it.
        alert_summary: Plain-text alert summary used as prompt material.
    """
    import html  # noqa: PLC0415

    # The package root differs between deployment layouts; try both.
    try:
        from apps.api.src.services.chat_manager import ChatManager  # noqa: PLC0415
    except ImportError:
        try:
            from src.services.chat_manager import ChatManager  # noqa: PLC0415
        except ImportError:
            logger.error("trigger_group_ai_discussion_failed", reason="Cannot import ChatManager")
            return

    try:
        chat_mgr = ChatManager()

        # Step 1: OpenClaw analyses the alert.
        openclaw_prompt = (
            f"你是 OpenClaw,AWOOOI SRE 戰情室的首席 AI 分析師。\n"
            f"以下是一則基礎設施告警,請進行 RCA 根因分析並給出 3 點建議行動:\n\n"
            f"{alert_summary}"
        )
        openclaw_analysis = await chat_mgr._call_openclaw(
            system_prompt="你是 OpenClaw,AWOOOI SRE 戰情室首席 AI,精通 K8s、Prometheus、告警分析。",
            user_message=openclaw_prompt,
        )

        if not openclaw_analysis:
            logger.warning("trigger_group_ai_discussion_openclaw_empty")
            return

        # Model output is free text and is sent with parse_mode="HTML";
        # escape it so a stray <, > or & cannot make Telegram reject the
        # message. The raw text is still used for the follow-up prompt.
        openclaw_text = f"🔍 OpenClaw 分析\n\n{html.escape(str(openclaw_analysis), quote=False)}"
        openclaw_result = await self.send_as_openclaw(
            text=openclaw_text,
            reply_to_message_id=alert_message_id,
        )

        openclaw_msg_id = (
            openclaw_result.get("result", {}).get("message_id")
            if openclaw_result.get("ok")
            else None
        )

        logger.info("group_ai_discussion_openclaw_sent", message_id=openclaw_msg_id)

        # Step 2: NemoClaw comments on the OpenClaw analysis.
        nemo_prompt = (
            f"你是 NemoClaw,AWOOOI SRE 戰情室的 NemoClaw AI。\n"
            f"OpenClaw 剛剛對以下告警做了分析:\n\n"
            f"【原始告警】\n{alert_summary}\n\n"
            f"【OpenClaw 分析】\n{openclaw_analysis}\n\n"
            f"請從不同角度補充你的觀點,並指出任何可能被忽略的風險點。"
        )
        nemo_analysis = await chat_mgr._call_nemotron(
            system_prompt="你是 NemoClaw,AWOOOI SRE 戰情室 AI,擅長補充分析與風險評估。",
            user_message=nemo_prompt,
        )

        if not nemo_analysis:
            logger.warning("trigger_group_ai_discussion_nemo_empty")
            return

        nemo_text = f"🤖 NemoClaw 補充\n\n{html.escape(str(nemo_analysis), quote=False)}"
        await self.send_as_nemotron(
            text=nemo_text,
            # Fall back to the alert itself if the OpenClaw reply has no id.
            reply_to_message_id=openclaw_msg_id or alert_message_id,
        )

        logger.info("group_ai_discussion_completed", alert_message_id=alert_message_id)

    except Exception as e:
        # A failed group discussion must not affect the main alert flow.
        logger.error("trigger_group_ai_discussion_failed", error=str(e))
============================================================================ + SENTRY_DSN: "CHANGE_ME" + # 2026-03-29 ogt: ADR-037 - Comment 回寫需要 Auth Token + # 取得方式: Sentry UI → Settings → Auth Tokens → Create New Token + # 權限: event:admin, project:read, project:write + SENTRY_AUTH_TOKEN: "CHANGE_ME" diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index 5095ddd4..3d08cc03 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -71,6 +71,12 @@ data: # Phase 22.6: 統帥需要直接在同一 Bot 與 OpenClaw/NemoClaw 雙 AI 對話 TELEGRAM_ENABLE_POLLING: "true" + # ============================================================================ + # 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate ADR-053) + # OPENCLAW_BOT_TOKEN / NEMOTRON_BOT_TOKEN 在 Secrets 中配置 + # ============================================================================ + SRE_GROUP_CHAT_ID: "-1003711974679" + # 快取 TTL (秒) CACHE_TTL_DASHBOARD: "300" CACHE_TTL_HOST_STATUS: "30"