feat(monitoring): Phase 20 Nemotron 完整監控整合
服務註冊表: - 新增 nvidia-nemotron AI 服務 - 3 個 Prometheus metrics 定義 - 4 個告警規則 (circuit_breaker, timeout, error_rate, rate_limit) - fallback 策略 (nvidia → gemini → ollama) Alertmanager 規則: - NvidiaCircuitBreakerOpen (P1) - NvidiaToolCallingHighLatency (P2) - NvidiaToolCallingHighErrorRate (P0) - NvidiaCircuitBreakerHalfOpen (Info) - NvidiaCircuitBreakerClosed (Info) - NvidiaNoRequests (P3) 自動修復: - fallback_to_gemini - fallback_to_ollama - switch_model ADR: ADR-036 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
122
k8s/monitoring/nvidia-alerts.yaml
Normal file
122
k8s/monitoring/nvidia-alerts.yaml
Normal file
@@ -0,0 +1,122 @@
|
||||
# NVIDIA Nemotron Tool Calling 告警規則
|
||||
# =================================================
|
||||
# 版本: v1.0
|
||||
# 建立日期: 2026-03-29
|
||||
# ADR: ADR-036
|
||||
# 用途: 監控 NVIDIA NIM API + Circuit Breaker 狀態
|
||||
#
|
||||
# 部署方式:
|
||||
# kubectl apply -f k8s/monitoring/nvidia-alerts.yaml
|
||||
# =================================================
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: nvidia-tool-calling-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus Operator
    # ruleSelector matches on — confirm against the Prometheus CR.
    app: prometheus
    release: prometheus
spec:
  groups:
    # =========================================================================
    # NVIDIA tool calling alert group (evaluated every 30s)
    # =========================================================================
    - name: nvidia_tool_calling
      interval: 30s
      rules:
        # -------------------------------------------------------------------
        # Circuit breaker opened (P1)
        # -------------------------------------------------------------------
        # The metric is a counter of state transitions, so comparing its raw
        # value to 0 stays true forever after the first transition. increase()
        # limits the alert to transitions seen in the last 5 minutes.
        # NOTE(review): increase() cannot see the very first increment of a
        # series that did not exist before; confirm the exporter initialises
        # each to_state series at 0.
        - alert: NvidiaCircuitBreakerOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0
          for: 1m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已斷路"
            description: "Circuit Breaker 已切換至 OPEN 狀態,API 請求將被拒絕"
            runbook: "docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md"
            auto_repair: "fallback_to_gemini"

        # -------------------------------------------------------------------
        # Tool calling high latency (P2)
        # -------------------------------------------------------------------
        # histogram_quantile must be fed per-second bucket rates; raw
        # cumulative buckets would yield a quantile over the whole lifetime of
        # the series instead of the last 5 minutes.
        # NOTE(review): the 45s threshold is only reachable if the histogram
        # has finite bucket bounds above 45 — confirm the instrumentation.
        - alert: NvidiaToolCallingHighLatency
          expr: histogram_quantile(0.95, rate(nvidia_tool_call_latency_seconds_bucket[5m])) > 45
          for: 5m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling P95 延遲 > 45s"
            description: "Tool Calling 請求延遲過高,可能影響用戶體驗"
            auto_repair: "switch_model"

        # -------------------------------------------------------------------
        # Tool calling high error rate (P0)
        # -------------------------------------------------------------------
        # Both sides are aggregated with sum(): without it the division
        # matches series one-to-one on identical labels, so the
        # status="error" series is divided by itself and the ratio is
        # always 1 (or NaN), never the real error fraction.
        - alert: NvidiaToolCallingHighErrorRate
          expr: |
            sum(rate(nvidia_tool_call_requests_total{status="error"}[5m]))
            /
            sum(rate(nvidia_tool_call_requests_total[5m])) > 0.1
          for: 5m
          labels:
            severity: critical
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 錯誤率 > 10%"
            description: "Tool Calling 錯誤率過高,可能是 API 問題或網路問題"
            auto_repair: "fallback_to_gemini"

        # -------------------------------------------------------------------
        # Circuit breaker half-open, recovery probe in progress (Info)
        # -------------------------------------------------------------------
        # Uses increase() for the same reason as NvidiaCircuitBreakerOpen:
        # the raw counter never returns to 0.
        - alert: NvidiaCircuitBreakerHalfOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="half_open"}[5m]) > 0
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 正在恢復測試"
            description: "Circuit Breaker 進入 HALF_OPEN 狀態,正在測試 API 是否恢復"

        # -------------------------------------------------------------------
        # Circuit breaker closed again (Info)
        # -------------------------------------------------------------------
        # Fires when a transition to "closed" happened in the last 5 minutes
        # and the breaker has opened at least once. ignoring(to_state) is
        # required: the two sides differ in to_state, so a plain "and" would
        # never find a matching label set and the alert could never fire.
        - alert: NvidiaCircuitBreakerClosed
          expr: |
            (increase(nvidia_circuit_breaker_state_changes_total{to_state="closed"}[5m]) > 0)
            and ignoring(to_state)
            (nvidia_circuit_breaker_state_changes_total{to_state="open"} > 0)
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已恢復正常"
            description: "Circuit Breaker 已從斷路狀態恢復"

        # -------------------------------------------------------------------
        # No requests at all (service may be unhealthy)
        # -------------------------------------------------------------------
        # sum() makes this a check on total traffic; per series, a single
        # idle label combination (e.g. status="error" with no errors) would
        # fire the alert even while other series carry traffic.
        # The "unless on()" clause drops the result whenever any
        # awoooi-api.* container reports kube_pod_container_status_running
        # == 0.
        - alert: NvidiaNoRequests
          expr: |
            sum(rate(nvidia_tool_call_requests_total[15m])) == 0
            unless on() (kube_pod_container_status_running{pod=~"awoooi-api.*"} == 0)
          for: 30m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 30 分鐘內無請求"
            description: "可能是整合問題或服務未被使用"
|
||||
@@ -533,6 +533,36 @@ ai_services:
|
||||
- inference_timeout
|
||||
cost_tracking: false
|
||||
|
||||
# --- NVIDIA Nemotron Tool Calling (Phase 20) ---
|
||||
# 2026-03-29 ogt: ADR-036 新增
|
||||
# Service registry entry for the NVIDIA Nemotron endpoint (type: external).
# NOTE(review): indentation is not visible in this view, so it cannot be
# determined here whether metrics/alerts nest under monitoring or sit at the
# service level — confirm against the source file.
- name: nvidia-nemotron
|
||||
type: external
|
||||
endpoint: https://integrate.api.nvidia.com/v1
|
||||
model: nvidia/llama-3.1-nemotron-70b-instruct
|
||||
rate_limit:
|
||||
requests_per_minute: 100
|
||||
tokens_per_minute: 200000
|
||||
features:
|
||||
- tool_calling
|
||||
|
||||
- function_calling
|
||||
monitoring:
|
||||
prometheus: true
|
||||
|
||||
langfuse: true
|
||||
|
||||
otel: true
|
||||
|
||||
metrics:
|
||||
- nvidia_tool_call_requests_total
|
||||
|
||||
- nvidia_tool_call_latency_seconds
|
||||
|
||||
- nvidia_circuit_breaker_state_changes_total
|
||||
|
||||
# NOTE(review): presumably these names refer to keys under alert_templates.
# The NVIDIA error-rate template visible in this change is keyed
# nvidia_high_error_rate, not high_error_rate, and no rate_limit_hit template
# is visible in this view — verify both resolve to the intended templates.
alerts:
|
||||
- circuit_breaker_open
|
||||
|
||||
- tool_calling_timeout
|
||||
|
||||
- high_error_rate
|
||||
|
||||
- rate_limit_hit
|
||||
fallback: gemini
|
||||
|
||||
cost_tracking: true
|
||||
|
||||
owner: ai-team
|
||||
|
||||
criticality: P0
|
||||
|
||||
# =============================================================================
|
||||
# 告警模板 (Alert Templates)
|
||||
# =============================================================================
|
||||
@@ -585,6 +615,33 @@ alert_templates:
|
||||
severity: critical
|
||||
auto_repair: restart_service
|
||||
|
||||
# --- NVIDIA Nemotron 告警 (Phase 20) ---
|
||||
# 2026-03-29 ogt: ADR-036 新增
|
||||
circuit_breaker_open:
  # Alert template: the NVIDIA circuit breaker transitioned to "open".
  # The metric is a counter, so its raw value stays above 0 forever after the
  # first transition; increase() restricts the condition to transitions seen
  # in the last 5 minutes so the alert can resolve.
  expr: 'increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0'
  for: 1m
  severity: warning
  auto_repair: fallback_to_gemini
  annotations:
    summary: 'NVIDIA Circuit Breaker 已斷路,切換至備援'
    runbook: 'docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md'
|
||||
|
||||
tool_calling_timeout:
  # Alert template: P95 tool-call latency above 45 seconds for 5 minutes.
  # histogram_quantile needs per-second bucket rates; applied to the raw
  # cumulative buckets it would report a quantile over the whole lifetime of
  # the series rather than recent traffic.
  expr: 'histogram_quantile(0.95, rate(nvidia_tool_call_latency_seconds_bucket[5m])) > 45'
  for: 5m
  severity: warning
  auto_repair: switch_model
  annotations:
    summary: 'NVIDIA Tool Calling P95 延遲 > 45s'
|
||||
|
||||
nvidia_high_error_rate:
  # Alert template: more than 10% of tool-call requests failed over 5 minutes.
  # Both sides are wrapped in sum(): without aggregation the division matches
  # series on identical labels, so the status="error" series is divided by
  # itself and the ratio is always 1 (or NaN) instead of the error fraction.
  expr: 'sum(rate(nvidia_tool_call_requests_total{status="error"}[5m])) / sum(rate(nvidia_tool_call_requests_total[5m])) > 0.1'
  for: 5m
  severity: critical
  auto_repair: fallback_to_gemini
  annotations:
    summary: 'NVIDIA Tool Calling 錯誤率 > 10%'
|
||||
|
||||
# =============================================================================
|
||||
# 自動修復動作 (Auto-Repair Actions)
|
||||
# =============================================================================
|
||||
@@ -624,3 +681,17 @@ auto_repair_actions:
|
||||
command: 'internal:trigger_memory_analysis'
|
||||
risk: low
|
||||
cooldown_minutes: 30
|
||||
|
||||
# --- NVIDIA Nemotron 自動修復 (Phase 20) ---
|
||||
# 2026-03-29 ogt: ADR-036 新增
|
||||
fallback_to_gemini:
  # Auto-repair action named by the auto_repair field of the
  # circuit_breaker_open and nvidia_high_error_rate alert templates.
  # NOTE(review): cooldown_minutes is presumably the minimum gap between two
  # runs of this action — confirm with the auto-repair executor.
  description: "NVIDIA API 失敗時自動切換至 Gemini"
  command: "internal:switch_provider_to_gemini"
  cooldown_minutes: 5
  risk: low
|
||||
|
||||
fallback_to_ollama:
  # Auto-repair action that issues the internal:switch_provider_to_ollama
  # command. No alert or template visible in this view references it.
  # NOTE(review): cooldown_minutes is presumably the minimum gap between two
  # runs of this action — confirm with the auto-repair executor.
  description: "Cloud API 失敗時自動切換至本地 Ollama"
  command: "internal:switch_provider_to_ollama"
  cooldown_minutes: 5
  risk: low
|
||||
|
||||
Reference in New Issue
Block a user