diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 2da24794..335a3767 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -3024,37 +3024,19 @@ class TelegramGateway: await self.send_notification(text) self._last_message_time = datetime.now(UTC) - # Nemotron 異常時:自動修復 + 標準格式告警 + # Nemotron 異常時:告警通知(不自動關閉,NIM 免費 tier 本來就慢) + # 2026-04-03 ogt: 修正 — 之前錯誤地自動關閉 Nemotron 協作 + # Nemotron 是產品核心,慢(11-45s)是免費 tier 特性,不是需要修復的異常 if not nemo_ok: - fix_action = "kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod" - auto_fixed = False - - # 自動修復: 關閉 Nemotron 協作避免每個 incident 白等 30s - try: - import asyncio - proc = await asyncio.create_subprocess_exec( - "kubectl", "set", "env", "deployment/awoooi-api", - "ENABLE_NEMOTRON_COLLABORATION=false", - "-n", "awoooi-prod", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - _, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0) - auto_fixed = proc.returncode == 0 - if not auto_fixed: - logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100]) - except Exception as fix_err: - logger.error("nemotron_auto_fix_error", error=str(fix_err)) - alert = InfraAlertMessage( component="Nemotron NIM (NVIDIA API)", status=nemo_status, - impact="所有 incident Nemotron Tool Calling 將 100% 超時", - auto_fixed=auto_fixed, - fix_action=fix_action, + impact="NIM 回應慢於探測 timeout,incident 分析可能延遲", + auto_fixed=False, + fix_action="無需自動修復,NIM 免費 tier 延遲 11-45s 屬正常範圍", ) await self.send_notification(alert.format(), parse_mode="HTML") - logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed) + logger.warning("nemotron_health_slow_alert", status=nemo_status) logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok) return True