feat(monitoring): Phase 20 Nemotron 完整監控整合

服務註冊表:
- 新增 nvidia-nemotron AI 服務
- 3 個 Prometheus metrics 定義
- 4 個告警規則 (circuit_breaker, timeout, error_rate, rate_limit)
- fallback 策略 (nvidia → gemini → ollama)

Alertmanager 規則:
- NvidiaCircuitBreakerOpen (P1)
- NvidiaToolCallingHighLatency (P2)
- NvidiaToolCallingHighErrorRate (P0)
- NvidiaCircuitBreakerHalfOpen (Info)
- NvidiaCircuitBreakerClosed (Info)
- NvidiaNoRequests (P3)

自動修復:
- fallback_to_gemini
- fallback_to_ollama
- switch_model

ADR: ADR-036

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-29 02:05:59 +08:00
parent 183776a34f
commit 93c3280481
2 changed files with 193 additions and 0 deletions

View File

@@ -0,0 +1,122 @@
# NVIDIA Nemotron Tool Calling 告警規則
# =================================================
# 版本: v1.0
# 建立日期: 2026-03-29
# ADR: ADR-036
# 用途: 監控 NVIDIA NIM API + Circuit Breaker 狀態
#
# 部署方式:
# kubectl apply -f k8s/monitoring/nvidia-alerts.yaml
# =================================================
# PrometheusRule carrying all NVIDIA Nemotron tool-calling alerts.
# NOTE(review): the app/release labels presumably match the Prometheus
# Operator ruleSelector of this cluster - confirm before deploying.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: nvidia-tool-calling-rules
  namespace: monitoring
  labels:
    app: prometheus
    release: prometheus
spec:
  groups:
    # =====================================================================
    # NVIDIA tool-calling alert group
    # =====================================================================
    - name: nvidia_tool_calling
      interval: 30s
      rules:
        # -----------------------------------------------------------------
        # Circuit breaker opened (P1)
        # -----------------------------------------------------------------
        # The metric is a cumulative counter, so comparing the raw value
        # with 0 would keep the alert firing forever after the first trip.
        # increase() limits the alert to transitions in the last 5 minutes.
        # NOTE(review): if the series is first exported with value 1,
        # increase() cannot see that very first transition - confirm the
        # counter is initialised to 0 at start-up.
        - alert: NvidiaCircuitBreakerOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0
          for: 1m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已斷路"
            description: "Circuit Breaker 已切換至 OPEN 狀態,API 請求將被拒絕"
            runbook: "docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md"
            auto_repair: "fallback_to_gemini"
        # -----------------------------------------------------------------
        # Tool-calling high latency (P2)
        # -----------------------------------------------------------------
        # histogram_quantile() needs per-second bucket rates aggregated by
        # "le"; feeding it raw bucket counters yields a since-process-start
        # quantile (and one result per label set) instead of a recent P95.
        - alert: NvidiaToolCallingHighLatency
          expr: |
            histogram_quantile(
              0.95,
              sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))
            ) > 45
          for: 5m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling P95 延遲 > 45s"
            description: "Tool Calling 請求延遲過高,可能影響用戶體驗"
            auto_repair: "switch_model"
        # -----------------------------------------------------------------
        # Tool-calling high error rate (P0)
        # -----------------------------------------------------------------
        # Both sides are summed so that the division is errors / all
        # requests. Without aggregation the status="error" series is only
        # matched against itself and the ratio is 1 whenever errors exist.
        - alert: NvidiaToolCallingHighErrorRate
          expr: |
            sum(rate(nvidia_tool_call_requests_total{status="error"}[5m]))
            /
            sum(rate(nvidia_tool_call_requests_total[5m])) > 0.1
          for: 5m
          labels:
            severity: critical
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 錯誤率 > 10%"
            description: "Tool Calling 錯誤率過高,可能是 API 問題或網路問題"
            auto_repair: "fallback_to_gemini"
        # -----------------------------------------------------------------
        # Circuit breaker half-open, recovery probe (Info)
        # -----------------------------------------------------------------
        # Same cumulative-counter reasoning as NvidiaCircuitBreakerOpen.
        - alert: NvidiaCircuitBreakerHalfOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="half_open"}[5m]) > 0
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 正在恢復測試"
            description: "Circuit Breaker 進入 HALF_OPEN 狀態,正在測試 API 是否恢復"
        # -----------------------------------------------------------------
        # Circuit breaker recovered (Info)
        # -----------------------------------------------------------------
        # "and on()" is required: the two operands differ in to_state, so a
        # plain "and" (which matches on all labels) never returns anything.
        # NOTE(review): on() ignores every label; with several replicas,
        # narrow it to the instance-identifying label (e.g. on(pod)).
        - alert: NvidiaCircuitBreakerClosed
          expr: |
            increase(nvidia_circuit_breaker_state_changes_total{to_state="closed"}[5m]) > 0
            and on()
            nvidia_circuit_breaker_state_changes_total{to_state="open"} > 0
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已恢復正常"
            description: "Circuit Breaker 已從斷路狀態恢復"
        # -----------------------------------------------------------------
        # No requests at all (service may be broken or unused)
        # -----------------------------------------------------------------
        # The rate is summed over all label sets so that one idle series
        # (for example status="error") cannot trigger the alert while other
        # requests are flowing. The alert is suppressed while any matching
        # awoooi-api container reports "not running".
        - alert: NvidiaNoRequests
          expr: |
            sum(rate(nvidia_tool_call_requests_total[15m])) == 0
            unless on() (kube_pod_container_status_running{pod=~"awoooi-api.*"} == 0)
          for: 30m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 30 分鐘內無請求"
            description: "可能是整合問題或服務未被使用"

View File

@@ -533,6 +533,36 @@ ai_services:
- inference_timeout
cost_tracking: false
# --- NVIDIA Nemotron Tool Calling (Phase 20) ---
# 2026-03-29 ogt: added per ADR-036
# Registry entry for the NVIDIA Nemotron model reached through an external
# HTTPS endpoint and used for tool/function calling.
- name: nvidia-nemotron
type: external
endpoint: https://integrate.api.nvidia.com/v1
model: nvidia/llama-3.1-nemotron-70b-instruct
# Per-minute request and token limits declared for this service.
# NOTE(review): whether these are enforced locally or only mirror the
# provider quota is not visible here - confirm.
rate_limit:
requests_per_minute: 100
tokens_per_minute: 200000
features:
- tool_calling
- function_calling
# Observability back ends switched on for this service.
monitoring:
prometheus: true
langfuse: true
otel: true
# Prometheus metric names associated with this service.
metrics:
- nvidia_tool_call_requests_total
- nvidia_tool_call_latency_seconds
- nvidia_circuit_breaker_state_changes_total
# Alert template names attached to this service.
# NOTE(review): the NVIDIA templates in alert_templates are named
# circuit_breaker_open, tool_calling_timeout and nvidia_high_error_rate;
# confirm that high_error_rate and rate_limit_hit resolve to templates
# defined elsewhere in alert_templates.
alerts:
- circuit_breaker_open
- tool_calling_timeout
- high_error_rate
- rate_limit_hit
# Name of the service to fall back to.
fallback: gemini
cost_tracking: true
owner: ai-team
criticality: P0
# =============================================================================
# 告警模板 (Alert Templates)
# =============================================================================
@@ -585,6 +615,33 @@ alert_templates:
severity: critical
auto_repair: restart_service
# --- NVIDIA Nemotron alerts (Phase 20) ---
# 2026-03-29 ogt: added per ADR-036
# Fires when the circuit breaker moved to OPEN within the last 5 minutes.
# The metric is a cumulative counter, so comparing its raw value with 0 would
# keep the alert (and its auto-repair) active forever after the first trip.
# NOTE(review): if the series is first exported with value 1, increase()
# cannot see that very first transition - confirm the counter starts at 0.
circuit_breaker_open:
  expr: 'increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0'
  for: 1m
  severity: warning
  auto_repair: fallback_to_gemini
  annotations:
    summary: 'NVIDIA Circuit Breaker 已斷路,切換至備援'
    runbook: 'docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md'
# Fires when the P95 tool-call latency of the last 5 minutes exceeds 45 s.
# histogram_quantile() must receive per-second bucket rates aggregated by
# "le"; raw bucket counters would give a since-process-start quantile.
# NOTE(review): confirm that a switch_model action exists in
# auto_repair_actions.
tool_calling_timeout:
  expr: 'histogram_quantile(0.95, sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))) > 45'
  for: 5m
  severity: warning
  auto_repair: switch_model
  annotations:
    summary: 'NVIDIA Tool Calling P95 延遲 > 45s'
# Fires when more than 10% of tool-call requests failed over 5 minutes.
# Both operands are summed so the division is errors / all requests; without
# aggregation the status="error" series is matched only against itself and
# the ratio is 1 whenever any error occurs.
nvidia_high_error_rate:
  expr: >-
    sum(rate(nvidia_tool_call_requests_total{status="error"}[5m]))
    / sum(rate(nvidia_tool_call_requests_total[5m])) > 0.1
  for: 5m
  severity: critical
  auto_repair: fallback_to_gemini
  annotations:
    summary: 'NVIDIA Tool Calling 錯誤率 > 10%'
# =============================================================================
# 自動修復動作 (Auto-Repair Actions)
# =============================================================================
@@ -624,3 +681,17 @@ auto_repair_actions:
command: 'internal:trigger_memory_analysis'
risk: low
cooldown_minutes: 30
# --- NVIDIA Nemotron auto-repair (Phase 20) ---
# 2026-03-29 ogt: added per ADR-036
# Auto-repair action that runs the internal switch-to-Gemini command,
# with a 5-minute cooldown.
fallback_to_gemini:
  command: 'internal:switch_provider_to_gemini'
  risk: low
  cooldown_minutes: 5
  description: 'NVIDIA API 失敗時自動切換至 Gemini'
# Auto-repair action that runs the internal switch-to-Ollama command,
# with a 5-minute cooldown.
fallback_to_ollama:
  command: 'internal:switch_provider_to_ollama'
  risk: low
  cooldown_minutes: 5
  description: 'Cloud API 失敗時自動切換至本地 Ollama'