From 93c3280481622e1933ce1fc497e983706d635e7d Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 29 Mar 2026 02:05:59 +0800
Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20Phase=2020=20Nemotron=20?=
 =?UTF-8?q?=E5=AE=8C=E6=95=B4=E7=9B=A3=E6=8E=A7=E6=95=B4=E5=90=88?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

服務註冊表:
- 新增 nvidia-nemotron AI 服務
- 3 個 Prometheus metrics 定義
- 4 個告警規則 (circuit_breaker, timeout, error_rate, rate_limit)
- fallback 策略 (nvidia → gemini → ollama)

Alertmanager 規則:
- NvidiaCircuitBreakerOpen (P1)
- NvidiaToolCallingHighLatency (P2)
- NvidiaToolCallingHighErrorRate (P0)
- NvidiaCircuitBreakerHalfOpen (Info)
- NvidiaCircuitBreakerClosed (Info)
- NvidiaNoRequests (P3)

自動修復:
- fallback_to_gemini
- fallback_to_ollama
- switch_model

ADR: ADR-036

Co-Authored-By: Claude Opus 4.5
---
Review notes (text in this area is not part of the commit message):

* Line breaks and indentation were rebuilt from a whitespace-collapsed
  copy of this patch. Verify the context lines and the nesting of
  `metrics` / `alerts` against the target tree before applying;
  `git apply --ignore-space-change` may help with context whitespace.
* PromQL was revised without changing the number of added lines:
  - breaker open / half-open: `increase(...[5m]) > 0` instead of a raw
    counter `> 0`, which stays true forever after the first transition.
  - P95 latency: `histogram_quantile` over
    `sum by (le) (rate(..._bucket[5m]))` instead of raw cumulative buckets.
  - error rate: `sum()` on both sides, so the status="error" series is no
    longer divided label-for-label by itself.
  - breaker closed: `and ignoring(to_state)`, so the closed and open
    series can actually match.
  - no requests: `sum()`, so a single idle label set does not fire it.
* The `index` blob ids below come from the original commit and do not
  match the revised content.
* NOTE(review): the service entry lists `high_error_rate` and
  `rate_limit_hit`, while this patch adds `nvidia_high_error_rate` and no
  rate-limit template; `switch_model` is not defined here either.
  Confirm these keys resolve elsewhere in service-registry.yaml.

 k8s/monitoring/nvidia-alerts.yaml    | 122 +++++++++++++++++++++++++++
 ops/monitoring/service-registry.yaml |  71 ++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 k8s/monitoring/nvidia-alerts.yaml

diff --git a/k8s/monitoring/nvidia-alerts.yaml b/k8s/monitoring/nvidia-alerts.yaml
new file mode 100644
index 00000000..c6cd1722
--- /dev/null
+++ b/k8s/monitoring/nvidia-alerts.yaml
@@ -0,0 +1,122 @@
+# NVIDIA Nemotron Tool Calling alert rules
+# =================================================
+# Version: v1.0
+# Created: 2026-03-29
+# ADR: ADR-036
+# Purpose: monitor the NVIDIA NIM API + Circuit Breaker state
+#
+# Deploy with:
+#   kubectl apply -f k8s/monitoring/nvidia-alerts.yaml
+# =================================================
+
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: nvidia-tool-calling-rules
+  namespace: monitoring
+  labels:
+    app: prometheus
+    release: prometheus
+spec:
+  groups:
+    # =========================================================================
+    # NVIDIA Tool Calling alert group
+    # =========================================================================
+    - name: nvidia_tool_calling
+      interval: 30s
+      rules:
+        # -------------------------------------------------------------------
+        # Circuit Breaker opened (P1); increase() lets the alert resolve
+        # -------------------------------------------------------------------
+        - alert: NvidiaCircuitBreakerOpen
+          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0
+          for: 1m
+          labels:
+            severity: warning
+            service: nvidia-nemotron
+            owner: ai-team
+          annotations:
+            summary: "NVIDIA Circuit Breaker 已斷路"
+            description: "Circuit Breaker 已切換至 OPEN 狀態,API 請求將被拒絕"
+            runbook: "docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md"
+            auto_repair: "fallback_to_gemini"
+
+        # -------------------------------------------------------------------
+        # Tool Calling high latency (P2); P95 over the last 5m, all series
+        # -------------------------------------------------------------------
+        - alert: NvidiaToolCallingHighLatency
+          expr: histogram_quantile(0.95, sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))) > 45
+          for: 5m
+          labels:
+            severity: warning
+            service: nvidia-nemotron
+            owner: ai-team
+          annotations:
+            summary: "NVIDIA Tool Calling P95 延遲 > 45s"
+            description: "Tool Calling 請求延遲過高,可能影響用戶體驗"
+            auto_repair: "switch_model"
+
+        # -------------------------------------------------------------------
+        # Tool Calling high error rate (P0); sum() so the ratio is errors/all
+        # -------------------------------------------------------------------
+        - alert: NvidiaToolCallingHighErrorRate
+          expr: |
+            sum(rate(nvidia_tool_call_requests_total{status="error"}[5m]))
+            /
+            sum(rate(nvidia_tool_call_requests_total[5m])) > 0.1
+          for: 5m
+          labels:
+            severity: critical
+            service: nvidia-nemotron
+            owner: ai-team
+          annotations:
+            summary: "NVIDIA Tool Calling 錯誤率 > 10%"
+            description: "Tool Calling 錯誤率過高,可能是 API 問題或網路問題"
+            auto_repair: "fallback_to_gemini"
+
+        # -------------------------------------------------------------------
+        # Circuit Breaker half-open recovery probe (Info)
+        # -------------------------------------------------------------------
+        - alert: NvidiaCircuitBreakerHalfOpen
+          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="half_open"}[5m]) > 0
+          for: 30s
+          labels:
+            severity: info
+            service: nvidia-nemotron
+            owner: ai-team
+          annotations:
+            summary: "NVIDIA Circuit Breaker 正在恢復測試"
+            description: "Circuit Breaker 進入 HALF_OPEN 狀態,正在測試 API 是否恢復"
+
+        # -------------------------------------------------------------------
+        # Circuit Breaker recovered (Info); only if it has opened before
+        # -------------------------------------------------------------------
+        - alert: NvidiaCircuitBreakerClosed
+          expr: |
+            increase(nvidia_circuit_breaker_state_changes_total{to_state="closed"}[5m]) > 0
+            and ignoring(to_state)
+            nvidia_circuit_breaker_state_changes_total{to_state="open"} > 0
+          for: 30s
+          labels:
+            severity: info
+            service: nvidia-nemotron
+            owner: ai-team
+          annotations:
+            summary: "NVIDIA Circuit Breaker 已恢復正常"
+            description: "Circuit Breaker 已從斷路狀態恢復"
+
+        # -------------------------------------------------------------------
+        # No requests (service may be unhealthy); muted while a pod is down
+        # -------------------------------------------------------------------
+        - alert: NvidiaNoRequests
+          expr: |
+            sum(rate(nvidia_tool_call_requests_total[15m])) == 0
+            unless on() (kube_pod_container_status_running{pod=~"awoooi-api.*"} == 0)
+          for: 30m
+          labels:
+            severity: warning
+            service: nvidia-nemotron
+            owner: ai-team
+          annotations:
+            summary: "NVIDIA Tool Calling 30 分鐘內無請求"
+            description: "可能是整合問題或服務未被使用"
diff --git a/ops/monitoring/service-registry.yaml b/ops/monitoring/service-registry.yaml
index be420566..d99c884d 100644
--- a/ops/monitoring/service-registry.yaml
+++ b/ops/monitoring/service-registry.yaml
@@ -533,6 +533,36 @@ ai_services:
       - inference_timeout
     cost_tracking: false
 
+  # --- NVIDIA Nemotron Tool Calling (Phase 20) ---
+  # 2026-03-29 ogt: added per ADR-036
+  - name: nvidia-nemotron
+    type: external
+    endpoint: https://integrate.api.nvidia.com/v1
+    model: nvidia/llama-3.1-nemotron-70b-instruct
+    rate_limit:
+      requests_per_minute: 100
+      tokens_per_minute: 200000
+    features:
+      - tool_calling
+      - function_calling
+    monitoring:
+      prometheus: true
+      langfuse: true
+      otel: true
+    metrics:
+      - nvidia_tool_call_requests_total
+      - nvidia_tool_call_latency_seconds
+      - nvidia_circuit_breaker_state_changes_total
+    alerts:
+      - circuit_breaker_open
+      - tool_calling_timeout
+      - high_error_rate
+      - rate_limit_hit
+    fallback: gemini
+    cost_tracking: true
+    owner: ai-team
+    criticality: P0
+
 # =============================================================================
 # 告警模板 (Alert Templates)
 # =============================================================================
@@ -585,6 +615,33 @@ alert_templates:
     severity: critical
     auto_repair: restart_service
 
+  # --- NVIDIA Nemotron alerts (Phase 20) ---
+  # 2026-03-29 ogt: added per ADR-036
+  circuit_breaker_open:
+    expr: 'increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0'
+    for: 1m
+    severity: warning
+    auto_repair: fallback_to_gemini
+    annotations:
+      summary: 'NVIDIA Circuit Breaker 已斷路,切換至備援'
+      runbook: 'docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md'
+
+  tool_calling_timeout:
+    expr: 'histogram_quantile(0.95, sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))) > 45'
+    for: 5m
+    severity: warning
+    auto_repair: switch_model
+    annotations:
+      summary: 'NVIDIA Tool Calling P95 延遲 > 45s'
+
+  nvidia_high_error_rate:
+    expr: 'sum(rate(nvidia_tool_call_requests_total{status="error"}[5m])) / sum(rate(nvidia_tool_call_requests_total[5m])) > 0.1'
+    for: 5m
+    severity: critical
+    auto_repair: fallback_to_gemini
+    annotations:
+      summary: 'NVIDIA Tool Calling 錯誤率 > 10%'
+
 # =============================================================================
 # 自動修復動作 (Auto-Repair Actions)
 # =============================================================================
@@ -624,3 +681,17 @@ auto_repair_actions:
     command: 'internal:trigger_memory_analysis'
     risk: low
     cooldown_minutes: 30
+
+  # --- NVIDIA Nemotron auto-repair (Phase 20) ---
+  # 2026-03-29 ogt: added per ADR-036
+  fallback_to_gemini:
+    command: 'internal:switch_provider_to_gemini'
+    risk: low
+    cooldown_minutes: 5
+    description: 'NVIDIA API 失敗時自動切換至 Gemini'
+
+  fallback_to_ollama:
+    command: 'internal:switch_provider_to_ollama'
+    risk: low
+    cooldown_minutes: 5
+    description: 'Cloud API 失敗時自動切換至本地 Ollama'