From 46843c8e1977d61744d2e2f4c80be54339ce6097 Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Tue, 31 Mar 2026 13:57:10 +0800
Subject: [PATCH] fix(nvidia): revert to nemotron-mini, truncate context for 4K
 limit, enforce precise confidence

---
 apps/api/models.json                     |  8 ++++----
 apps/api/src/core/prompts.py             |  1 +
 apps/api/src/services/model_registry.py  |  4 ++--
 apps/api/src/services/nvidia_provider.py |  4 ++--
 apps/api/src/services/openclaw.py        | 12 +++++++-----
 5 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/apps/api/models.json b/apps/api/models.json
index cb9f42b1..3e542590 100644
--- a/apps/api/models.json
+++ b/apps/api/models.json
@@ -105,15 +105,15 @@
     },
 
     "nvidia": {
-      "name": "NVIDIA NIM (Llama 3.1 8B / ADR-036)",
+      "name": "NVIDIA Nemotron (ADR-036)",
       "enabled": true,
       "priority": 4,
       "endpoint": "https://integrate.api.nvidia.com/v1",
       "api_path": "/chat/completions",
       "models": {
-        "default": "meta/llama-3.1-8b-instruct",
-        "tool_calling": "meta/llama-3.1-8b-instruct",
-        "rca": "meta/llama-3.1-8b-instruct"
+        "default": "nvidia/nemotron-mini-4b-instruct",
+        "tool_calling": "nvidia/nemotron-mini-4b-instruct",
+        "rca": "nvidia/nemotron-mini-4b-instruct"
       },
       "options": {
         "temperature": 0.0,
diff --git a/apps/api/src/core/prompts.py b/apps/api/src/core/prompts.py
index 0dc3777a..b1a623ab 100644
--- a/apps/api/src/core/prompts.py
+++ b/apps/api/src/core/prompts.py
@@ -63,6 +63,7 @@ For each optimization suggestion, provide EXECUTABLE kubectl commands:
 ## ⚠️ Output Rules
 - You MUST respond with ONLY valid JSON
 - confidence MUST be between 0.0 and 1.0
+- **CRITICAL**: The `confidence` score MUST be mathematically precise and varied (e.g., 0.82, 0.91, 0.77). Do NOT default to generic numbers ending in 5 or 0 like 0.75, 0.80, 0.85. Calculate it strictly based on data evidence.
 - If confidence < 0.70, set primary_responsibility to "COLLAB"
 - optimization_suggestions MUST contain executable kubectl commands
 - Each suggestion needs: type, description, kubectl_or_config (REQUIRED)
diff --git a/apps/api/src/services/model_registry.py b/apps/api/src/services/model_registry.py
index c08e3572..81385113 100644
--- a/apps/api/src/services/model_registry.py
+++ b/apps/api/src/services/model_registry.py
@@ -144,8 +144,8 @@ class ModelRegistry:
                 # 2026-03-29 ogt: P2-3 加入 NVIDIA (ADR-036)
                 "nvidia": {
                     "models": {
-                        "default": "meta/llama-3.1-8b-instruct",
-                        "tool_calling": "meta/llama-3.1-8b-instruct",
+                        "default": "nvidia/nemotron-mini-4b-instruct",
+                        "tool_calling": "nvidia/nemotron-mini-4b-instruct",
                     }
                 },
             },
diff --git a/apps/api/src/services/nvidia_provider.py b/apps/api/src/services/nvidia_provider.py
index 5736b85e..a6c611a4 100644
--- a/apps/api/src/services/nvidia_provider.py
+++ b/apps/api/src/services/nvidia_provider.py
@@ -114,8 +114,8 @@ class INvidiaProvider(Protocol):
 # NVIDIA NIM API Endpoint
 NVIDIA_API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
 
-# 預設模型 (2026-03-31 ogt: 修正為 128k context 版的 Llama 3.1)
-NVIDIA_DEFAULT_MODEL = "meta/llama-3.1-8b-instruct"
+# 預設模型 (2026-03-31 ogt: 恢復為 nemotron-mini-4b-instruct)
+NVIDIA_DEFAULT_MODEL = "nvidia/nemotron-mini-4b-instruct"
 
 # 請求超時 (秒) - Nemotron 延遲 11-45s
 NVIDIA_TIMEOUT = 60.0
diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py
index 8fae642c..93ea05df 100644
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -1171,10 +1171,10 @@ Trace URL: {signoz_trace_url}
             - risk_level: 風險等級
             - reasoning: LLM 推理過程
         """
-        # 建構 prompt
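+        # Build the prompt (2026-03-31 ogt: Nemotron-mini has a small context window, so only the first 3 signals are included and each description is cut to 100 characters)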
         signal_summary = "\n".join([
-            f"- {s.get('alert_name', 'unknown')}: {s.get('description', 'N/A')}"
-            for s in signals[:10]  # 最多 10 筆
+            f"- {s.get('alert_name', 'unknown')}: {str(s.get('description', 'N/A'))[:100]}..."
+            for s in signals[:3]  # 最多 3 筆，每筆最多 100 字元
         ])
 
         target = affected_services[0] if affected_services else "unknown-service"
@@ -1199,8 +1199,10 @@ Trace URL: {signoz_trace_url}
             diagnosis_cmds = expert_context.get("suggested_diagnosis_commands", [])
             diagnosis_cmds_str = "\n".join([f"  - `{cmd}`" for cmd in diagnosis_cmds]) if diagnosis_cmds else "  - (無)"
 
-            # ADR-030: 加入完整診斷上下文 (如果有)
-            full_diagnosis = expert_context.get("diagnosis_context", "")
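+            # ADR-030: include the full diagnosis context (if any), cut to 800 characters to fit the 4K context limit; a "... (truncated)" marker is appended when text was dropped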
+            full_diagnosis = str(expert_context.get("diagnosis_context", ""))[:800]
+            if len(str(expert_context.get("diagnosis_context", ""))) > 800:
+                full_diagnosis += "... (truncated)"
             diagnosis_signals = expert_context.get("diagnosis_signals", [])
             signals_summary = ""
             if diagnosis_signals: