From d8c9e2948575567cd125fd4fee79677ebf32f822 Mon Sep 17 00:00:00 2001 From: OG T Date: Fri, 3 Apr 2026 15:34:34 +0800 Subject: [PATCH] =?UTF-8?q?fix(heartbeat):=20=E6=92=A4=E9=8A=B7=E9=8C=AF?= =?UTF-8?q?=E8=AA=A4=E7=9A=84=20Nemotron=20=E8=87=AA=E5=8B=95=E9=97=9C?= =?UTF-8?q?=E9=96=89=E9=82=8F=E8=BC=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 之前錯誤地在偵測到 Nemotron 慢時自動執行 ENABLE_NEMOTRON_COLLABORATION=false, 這等於自動關掉產品核心功能。 Nemotron NIM 免費 tier 延遲 11-45s 是已知特性(Memory 有記載), 不是需要自動修復的異常。 現在:偵測慢只發告警通知,不執行任何自動修復。 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/telegram_gateway.py | 32 +++++------------------ 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 2da24794..335a3767 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -3024,37 +3024,19 @@ class TelegramGateway: await self.send_notification(text) self._last_message_time = datetime.now(UTC) - # Nemotron 異常時:自動修復 + 標準格式告警 + # Nemotron 異常時:告警通知(不自動關閉,NIM 免費 tier 本來就慢) + # 2026-04-03 ogt: 修正 — 之前錯誤地自動關閉 Nemotron 協作 + # Nemotron 是產品核心,慢(11-45s)是免費 tier 特性,不是需要修復的異常 if not nemo_ok: - fix_action = "kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod" - auto_fixed = False - - # 自動修復: 關閉 Nemotron 協作避免每個 incident 白等 30s - try: - import asyncio - proc = await asyncio.create_subprocess_exec( - "kubectl", "set", "env", "deployment/awoooi-api", - "ENABLE_NEMOTRON_COLLABORATION=false", - "-n", "awoooi-prod", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - _, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0) - auto_fixed = proc.returncode == 0 - if not auto_fixed: - logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100]) - except Exception as fix_err: - logger.error("nemotron_auto_fix_error", error=str(fix_err)) - alert = InfraAlertMessage( component="Nemotron NIM (NVIDIA API)", status=nemo_status, - impact="所有 incident Nemotron Tool Calling 將 100% 超時", - auto_fixed=auto_fixed, - fix_action=fix_action, + impact="NIM 回應慢於探測 timeout,incident 分析可能延遲", + auto_fixed=False, + fix_action="無需自動修復,NIM 免費 tier 延遲 11-45s 屬正常範圍", ) await self.send_notification(alert.format(), parse_mode="HTML") - logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed) + logger.warning("nemotron_health_slow_alert", status=nemo_status) logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok) return True